#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

static cl::opt<bool>
    EnableMADWide("nvptx-mad-wide-opt", cl::init(false), cl::Hidden,
                  cl::desc("Enable MAD wide optimization"));
NVPTX::DivPrecisionLevel
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
  return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
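// These small accessors simply forward to NVPTXTargetLowering (whose answers
// for FTZ, FMA contraction and division/sqrt precision depend on function
// attributes and the optimization level) or to the cl::opt flags defined
// above for the rsqrt-approximation and mad.wide rewrites.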
void NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  // ... (plain and atomic load/store, fence and several other cases elided)
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  // ...
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
  case NVPTXISD::LoadV8:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
  case NVPTXISD::StoreV8:
    if (tryStoreVector(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case ISD::INTRINSIC_VOID:
    if (tryIntrinsicVoid(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::CopyToReg:
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      return;
    }
    break;
  case ISD::CopyFromReg:
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      return;
    }
    break;
  // ...
    selectAtomicSwap128(N);
    return;
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (tryBF16ArithToFMA(N))
      return;
    break;
  case ISD::BR_JT:
    return selectBR_JT(N);
  default:
    break;
  }
  SelectCode(N);
}
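// Select() is the per-node entry point invoked by the generic SelectionDAGISel
// driver. The hand-written cases above cover what the TableGen'erated matcher
// cannot express conveniently (packed f16/bf16 compares, address-space casts,
// i128 copies, 128-bit atomics, jump tables, the tcgen05/TMA intrinsics);
// everything else falls through to SelectCode(N).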
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)

static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
  switch (IID) {
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
    return TCGEN05_LD_OPCODE(16x64b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
    return TCGEN05_LD_OPCODE(16x64b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
    return TCGEN05_LD_OPCODE(16x64b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
    return TCGEN05_LD_OPCODE(16x64b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
    return TCGEN05_LD_OPCODE(16x64b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
    return TCGEN05_LD_OPCODE(16x64b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
    return TCGEN05_LD_OPCODE(16x64b, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
    return TCGEN05_LD_OPCODE(16x64b, x128);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
    return TCGEN05_LD_OPCODE(16x128b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
    return TCGEN05_LD_OPCODE(16x128b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
    return TCGEN05_LD_OPCODE(16x128b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
    return TCGEN05_LD_OPCODE(16x128b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
    return TCGEN05_LD_OPCODE(16x128b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
    return TCGEN05_LD_OPCODE(16x128b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
    return TCGEN05_LD_OPCODE(16x128b, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
    return TCGEN05_LD_OPCODE(16x256b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
    return TCGEN05_LD_OPCODE(16x256b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
    return TCGEN05_LD_OPCODE(16x256b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
    return TCGEN05_LD_OPCODE(16x256b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
    return TCGEN05_LD_OPCODE(16x256b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
    return TCGEN05_LD_OPCODE(16x256b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
    return TCGEN05_LD_OPCODE(16x32bx2, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
    return TCGEN05_LD_OPCODE(16x32bx2, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
    return TCGEN05_LD_OPCODE(16x32bx2, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
    return TCGEN05_LD_OPCODE(16x32bx2, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
    return TCGEN05_LD_OPCODE(16x32bx2, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
    return TCGEN05_LD_OPCODE(16x32bx2, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
    return TCGEN05_LD_OPCODE(16x32bx2, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
    return TCGEN05_LD_OPCODE(16x32bx2, x128);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
    return TCGEN05_LD_OPCODE(32x32b, x1);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
    return TCGEN05_LD_OPCODE(32x32b, x2);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
    return TCGEN05_LD_OPCODE(32x32b, x4);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
    return TCGEN05_LD_OPCODE(32x32b, x8);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
    return TCGEN05_LD_OPCODE(32x32b, x16);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
    return TCGEN05_LD_OPCODE(32x32b, x32);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
    return TCGEN05_LD_OPCODE(32x32b, x64);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
    return TCGEN05_LD_OPCODE(32x32b, x128);
  }
  llvm_unreachable("unhandled tcgen05.ld intrinsic");
}
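// tcgen05.ld copies data from Tensor Memory into registers. The enablePack
// flag selects the ".pack::16b" form which, roughly speaking, packs two
// adjacent 16-bit elements into each 32-bit destination register; the plain
// form returns unpacked 32-bit lanes.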
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        "tcgen05.ld is not supported on this architecture variant");

  SDLoc DL(N);
  const unsigned IID = N->getConstantOperandVal(1);
  const bool EnablePack =
      N->getConstantOperandVal(N->getNumOperands() - 1) == 1;

  if (hasOffset) {
    // The 16x32bx2 forms carry an extra immediate offset operand.
    auto OffsetNode =
        CurDAG->getTargetConstant(N->getConstantOperandVal(3), DL, MVT::i32);
    ReplaceNode(N, CurDAG->getMachineNode(
                       getTcgen05LdOpcode(IID, EnablePack), DL, N->getVTList(),
                       {N->getOperand(2), OffsetNode, N->getOperand(0)}));
  } else {
    ReplaceNode(N, CurDAG->getMachineNode(
                       getTcgen05LdOpcode(IID, EnablePack), DL, N->getVTList(),
                       {N->getOperand(2), N->getOperand(0)}));
  }
}
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDU(N);

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
    SelectTcgen05Ld(N);
    return true;
  }

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, /*hasOffset=*/true);
    return true;
  }
  }
}
  // Mapping of ISD condition codes to PTX compare modes:
    return CmpMode::NotANumber;
  // ...
  return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
  ReplaceNode(N, SetP);
  return true;
}
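// setp on the packed f16x2/bf16x2 types produces one predicate per lane, which
// is why the machine node above is given two MVT::i1 results; the trailing
// immediate operand carries the flush-to-zero (ftz) modifier.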
bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  SDValue Vector = N->getOperand(0);
  MVT EltVT = N->getSimpleValueType(0);

  MachineSDNode *N2 =
      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);

  ReplaceNode(N, N2);
  return true;
}
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);
  MVT VT = Vector.getSimpleValueType();

  unsigned Opcode;
  if (VT.is32BitVector())
    Opcode = NVPTX::I32toV2I16;
  else if (VT.is64BitVector())
    Opcode = NVPTX::I64toV2I32;
  else
    return false;

  // Find all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->users()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There is no point splitting the vector if only one lane is ever read.
  if (E0.empty() || E1.empty())
    return false;

  // Split the vector once and forward lane 0 / lane 1 to all the extracts.
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}
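// Rationale: v2f16/v2bf16/v2i16 (and v2f32) values live in a single 32- or
// 64-bit register. When both lanes are extracted, it is cheaper to split the
// register once with I32toV2I16 / I64toV2I32 and reuse the two results than
// to select a separate extraction for every use.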
static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
  // Map an LLVM address-space number onto NVPTX::AddressSpace; unknown
  // address spaces yield std::nullopt.
  // ...
}

NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
  return convertAS(N->getMemOperand()->getAddrSpace())
      .value_or(NVPTX::AddressSpace::Generic);
}

NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
  // Lower the IR ordering of the operation to an NVPTX ordering.
  auto Ordering = N->getMergedOrdering();
  // ...
}

NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
  // ...
  return Scopes[N->getSyncScopeID()];
}
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
};
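// A memory operation may need two orderings: one encoded on the ld/st/atom
// instruction itself (InstructionOrdering) and one carried by a separate fence
// emitted in front of it (FenceOrdering). For example, a seq_cst load is
// lowered as a sequentially consistent fence followed by an acquire load.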
static OperationOrderings
getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
  AtomicOrdering Ordering = N->getMergedOrdering();
  auto AddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
  const bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();

  // PTX before sm_70 has no memory consistency model, so anything stronger
  // than monotonic cannot be expressed there.
  if (isStrongerThanMonotonic(Ordering) && !HasMemoryOrdering) {
    report_fatal_error(
        formatv("PTX does not support \"atomic\" for orderings different than "
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
                "is: \"{}\".",
                toIRString(Ordering)));
  }

  // Atomic lowering only applies to generic, global and shared addresses.
  bool AddrGenericOrGlobalOrShared =
      AddrSpace == NVPTX::AddressSpace::Generic ||
      AddrSpace == NVPTX::AddressSpace::Global ||
      AddrSpace == NVPTX::AddressSpace::Shared;
  if (!AddrGenericOrGlobalOrShared)
    return NVPTX::Ordering::NotAtomic;

  bool UseRelaxedMMIO =
      Subtarget->hasRelaxedMMIO() && AddrSpace == NVPTX::AddressSpace::Global;

  switch (Ordering) {
  // ... (NotAtomic / Monotonic / volatile handling elided)
  case AtomicOrdering::Acquire:
    if (!N->readMem())
      report_fatal_error(
          formatv("PTX only supports Acquire Ordering on reads: {}",
                  N->getOperationName()));
    return NVPTX::Ordering::Acquire;
  case AtomicOrdering::Release:
    if (!N->writeMem())
      report_fatal_error(
          formatv("PTX only supports Release Ordering on writes: {}",
                  N->getOperationName()));
    return NVPTX::Ordering::Release;
  case AtomicOrdering::AcquireRelease:
    report_fatal_error(
        formatv("NVPTX does not support AcquireRelease Ordering on "
                "read-modify-write yet and PTX does not support it on loads "
                "or stores: {}",
                N->getOperationName()));
  case AtomicOrdering::SequentiallyConsistent: {
    // seq_cst becomes a sequentially consistent fence plus an acquire load or
    // a release store.
    NVPTX::Ordering InstrOrder;
    if (N->readMem())
      InstrOrder = NVPTX::Ordering::Acquire;
    else if (N->writeMem())
      InstrOrder = NVPTX::Ordering::Release;
    else
      report_fatal_error(
          formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                  "read-modify-writes yet: {}",
                  N->getOperationName()));
    return OperationOrderings(InstrOrder,
                              NVPTX::Ordering::SequentiallyConsistent);
  }
  default:
    break;
  }
  report_fatal_error(
      formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
              toIRString(Ordering)));
}
NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
                                                  NVPTX::Ordering O) const {
  // ...
  auto S = Scopes[N->getSyncScopeID()];

  // Cluster scope is only valid on targets with cluster support (sm_90+).
  if (S == NVPTX::Scope::Cluster)
    Subtarget->failIfClustersUnsupported("cluster scope");
  // ...
  return S;
}
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
                               NVPTXSubtarget const *T) {
  if (S == NVPTX::Scope::Cluster)
    T->failIfClustersUnsupported(".cluster scope fence");

  // Targets without split acquire/release fences degrade both to acq_rel.
  if (!T->hasSplitAcquireAndReleaseFences() &&
      (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
    O = NVPTX::Ordering::AcquireRelease;

  switch (O) {
  case NVPTX::Ordering::Acquire:
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_acquire_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    default:
      report_fatal_error(formatv(
          "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
          ScopeToString(S)));
    }
  case NVPTX::Ordering::Release:
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_release_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    default:
      report_fatal_error(formatv(
          "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
          ScopeToString(S)));
    }
  case NVPTX::Ordering::AcquireRelease:
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    default:
      report_fatal_error(formatv(
          "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
          ScopeToString(S)));
    }
  case NVPTX::Ordering::SequentiallyConsistent:
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    default:
      break;
    }
    break;
  default:
    break;
  }
  report_fatal_error(
      formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
              OrderingToString(O), ScopeToString(S)));
}
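// Roughly speaking, this chooses between the modern PTX fence instructions
// (fence.acq_rel.* / fence.sc.* over the cta, cluster, gpu and sys scopes),
// available once the target implements the memory consistency model (sm_70+),
// and the legacy membar.{cta,gl,sys} instructions used as a conservative
// fallback on older targets.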
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
                                                MemSDNode *N) {
  auto [InstructionOrdering, FenceOrdering] =
      getOperationOrderings(N, Subtarget);
  auto Scope = getOperationScope(N, InstructionOrdering);

  // If the operation needs a leading fence (currently only seq_cst does),
  // emit it on the chain in front of the memory instruction itself.
  switch (FenceOrdering) {
  case NVPTX::Ordering::NotAtomic:
    break;
  case NVPTX::Ordering::SequentiallyConsistent: {
    auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
    Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
    break;
  }
  default:
    report_fatal_error(formatv("Unexpected fence ordering: \"{}\".",
                               OrderingToString(FenceOrdering)));
  }
  return {InstructionOrdering, Scope};
}
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  const auto *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  SDLoc DL(N);
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  unsigned Opc;
  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific-to-generic: widen a 32-bit pointer first if needed.
    if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
      SDNode *Cvt =
          CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64, Src);
      Src = SDValue(Cvt, 0);
    }
    switch (SrcAddrSpace) {
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
      break;
    case ADDRESS_SPACE_SHARED_CLUSTER:
      assert(TM.is64Bit() &&
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_shared_cluster_64;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
      break;
    }
  } else {
    // Generic-to-specific: only casts from the generic address space are
    // legal here.
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    switch (DstAddrSpace) {
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
      break;
    case ADDRESS_SPACE_SHARED_CLUSTER:
      assert(TM.is64Bit() &&
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_to_shared_cluster_64;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
      break;
    }
  }

  SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
  if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
    // Truncate the result back to a 32-bit pointer.
    CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
                                  SDValue(CVTA, 0));
  }
  ReplaceNode(N, CVTA);
}
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {
  // Pick the i16/i32/i64 flavour of an instruction for the given value type;
  // unsupported types yield std::nullopt.
  // ...
  return std::nullopt;
}

static bool isAddLike(const SDValue V) {
  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
}

static SDValue stripAssertAlign(SDValue N) {
  while (N.getOpcode() == ISD::AssertAlign)
    N = N.getOperand(0);
  return N;
}

static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
  if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
    return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
                                       GA->getValueType(0), GA->getOffset(),
                                       GA->getTargetFlags());
  if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
    return DAG->getTargetExternalSymbol(ES->getSymbol(), N.getValueType(),
                                        ES->getTargetFlags());
  // ...
  return N;
}

static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
  APInt AccumulatedOffset(64u, 0);
  // Fold add-like chains with constant right-hand sides into a single signed
  // 32-bit immediate offset.
  // ...
    const APInt CI = CN->getAPIntValue().sext(64);
    if (!(CI + AccumulatedOffset).isSignedIntN(32))
      break;
    AccumulatedOffset += CI;
  // ...
}
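// selectADDR() combines the helpers above: accumulateOffset() peels a constant
// displacement off the address and selectBaseADDR() target-lowers whatever
// base remains, producing the (Base, Offset) operand pair used by every load
// and store selected below.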
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  auto *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");

  const auto *PlainLoad = dyn_cast<LoadSDNode>(LD);
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
  const auto CodeAddrSpace = getAddrSpace(LD);
  const unsigned ExtensionType =
      PlainLoad ? PlainLoad->getExtensionType() : ISD::NON_EXTLOAD;
  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
                                ? NVPTX::PTXLdStInstCode::Signed
                                : NVPTX::PTXLdStInstCode::Untyped;

  uint32_t UsedBytesMask;
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    UsedBytesMask = UINT32_MAX;
    break;
  default:
    // The masked-load form carries the used-bytes mask as an operand.
    UsedBytesMask = N->getConstantOperandVal(3);
    break;
  }

  assert(FromTypeWidth <= 128 && "Invalid width for load");

  const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
  SDValue Ops[] = {getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),
                   getI32Imm(UsedBytesMask, DL),
                   Base,
                   Offset,
                   Chain};

  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
  if (!Opcode)
    return false;

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = LD->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  switch (N->getOpcode()) {
  // Hook the users of the original node up to the freshly selected load; the
  // masked-load form has an extra result to forward.
  // ...
  }
  return true;
}
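// The immediate operands attached to every selected load/store (ordering,
// scope, address space, extension type, access width and the used-bytes mask)
// are what the instruction printer later turns into ld/st qualifiers such as
// "ld.acquire.sys.global.b32".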
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  auto *LD = cast<MemSDNode>(N);

  SDLoc DL(N);
  const MVT EltVT = LD->getSimpleValueType(0);
  SDValue Chain = N->getOperand(0);
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const auto CodeAddrSpace = getAddrSpace(LD);
  const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
  const unsigned ExtensionType = N->getConstantOperandVal(4);
  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
                                ? NVPTX::PTXLdStInstCode::Signed
                                : NVPTX::PTXLdStInstCode::Untyped;
  const uint32_t UsedBytesMask = N->getConstantOperandVal(3);

  const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
  SDValue Ops[] = {getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),
                   getI32Imm(UsedBytesMask, DL),
                   Base,
                   Offset,
                   Chain};

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2,
                             NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
    break;
  case NVPTXISD::LoadV4:
    Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4,
                             NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
    break;
  case NVPTXISD::LoadV8:
    Opcode = pickOpcodeForVT(EltVT.SimpleTy, /*Opcode_i16=*/{},
                             NVPTX::LDV_i32_v8, /*Opcode_i64=*/{});
    break;
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem) {
  const unsigned TotalWidth = Mem->getMemoryVT().getSizeInBits();
  const unsigned NumElts = Mem->getNumValues() - 1; // Exclude the chain.
  auto ElementBitWidth = TotalWidth / NumElts;
  assert(ElementBitWidth <= 128 && TotalWidth <= 256 &&
         "Invalid width for load");
  return ElementBitWidth;
}

bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  SDLoc DL(LD);

  unsigned ExtensionType;
  uint32_t UsedBytesMask;
  if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
    ExtensionType = Load->getExtensionType();
    UsedBytesMask = UINT32_MAX;
  } else {
    ExtensionType = LD->getConstantOperandVal(4);
    UsedBytesMask = LD->getConstantOperandVal(3);
  }
  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
                                ? NVPTX::PTXLdStInstCode::Signed
                                : NVPTX::PTXLdStInstCode::Untyped;

  const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
  assert(!(LD->getSimpleValueType(0).isVector() &&
           ExtensionType != ISD::NON_EXTLOAD) &&
         "Vector loads cannot be extending");

  const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
  SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL),
                   getI32Imm(UsedBytesMask, DL), Base, Offset, LD->getChain()};

  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
  case ISD::LOAD:
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16,
                             NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
    break;
  // (masked scalar load variant, which has no i16 form)
    Opcode = pickOpcodeForVT(TargetVT, std::nullopt, NVPTX::LD_GLOBAL_NC_i32,
                             NVPTX::LD_GLOBAL_NC_i64);
    break;
  case NVPTXISD::LoadV2:
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16,
                             NVPTX::LD_GLOBAL_NC_v2i32,
                             NVPTX::LD_GLOBAL_NC_v2i64);
    break;
  case NVPTXISD::LoadV4:
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16,
                             NVPTX::LD_GLOBAL_NC_v4i32,
                             NVPTX::LD_GLOBAL_NC_v4i64);
    break;
  case NVPTXISD::LoadV8:
    Opcode = pickOpcodeForVT(TargetVT, /*Opcode_i16=*/{},
                             NVPTX::LD_GLOBAL_NC_v8i32, /*Opcode_i64=*/{});
    break;
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  ReplaceNode(LD, NVPTXLDG);
  return true;
}
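// tryLDG() selects the non-coherent global load (PTX "ld.global.nc", the
// __ldg path). It is only reached once canLowerToLDG() has established that
// the address is in the global address space and the load is invariant.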
bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
  auto *LD = cast<MemSDNode>(N);

  SDLoc DL(N);
  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // The pointer is operand 2 for the ldu intrinsics and operand 1 for the
  // vector node forms.
  const SDValue Addr =
      N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
  const auto [Base, Offset] = selectADDR(Addr, CurDAG);
  SDValue Ops[] = {Base, Offset, LD->getChain()};

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  case ISD::INTRINSIC_W_CHAIN: // Scalar ldu intrinsic.
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16,
                             NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
    break;
  case NVPTXISD::LDUV2:
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16,
                             NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
    break;
  case NVPTXISD::LDUV4:
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16,
                             NVPTX::LDU_GLOBAL_v4i32, /*Opcode_i64=*/{});
    break;
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  ReplaceNode(N, NVPTXLDU);
  return true;
}
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  auto *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");

  const auto *PlainStore = dyn_cast<StoreSDNode>(ST);
  const auto *AtomicStore = dyn_cast<AtomicSDNode>(ST);
  assert((PlainStore || AtomicStore) && "Expected store");
  if (PlainStore && PlainStore->isIndexed())
    return false;

  SDLoc DL(N);
  SDValue Chain = ST->getChain();
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  const auto CodeAddrSpace = getAddrSpace(ST);
  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
  assert(ToTypeWidth <= 128 && "Invalid width for store");

  const SDValue Value =
      PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
  SDValue Ops[] = {selectPossiblyImm(Value),
                   getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(ToTypeWidth, DL),
                   Base,
                   Offset,
                   Chain};

  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16,
                      NVPTX::ST_i32, NVPTX::ST_i64);
  if (!Opcode)
    return false;

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = ST->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  auto *ST = cast<MemSDNode>(N);
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();

  SDLoc DL(N);
  SDValue Chain = ST->getChain();
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  const auto CodeAddrSpace = getAddrSpace(ST);
  const unsigned NumElts = getStoreVectorNumElts(ST);

  SmallVector<SDValue, 16> Ops;
  for (auto &V : ST->ops().slice(1, NumElts))
    Ops.push_back(selectPossiblyImm(V));

  const unsigned ToTypeWidth = TotalWidth / NumElts;
  assert(ToTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for store");

  const auto [Base, Offset] = selectADDR(ST->getOperand(NumElts + 1), CurDAG);
  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
              Offset, Chain});

  const MVT::SimpleValueType EltVT =
      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {
  // The v2/v4/v8 store opcodes are picked with pickOpcodeForVT(), mirroring
  // the LDV_* selection in tryLoadVector().
  // ...
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  // Try to fold shift-and-mask and shift-of-shift patterns into a single PTX
  // bfe (bit-field extract) instruction.
  SDLoc DL(N);
  SDValue Val, Start, Len;
  bool IsSigned = false;

  // Pattern: (and (srl val, start), mask) with a contiguous mask.
  // ...
    uint64_t MaskVal = Mask->getZExtValue();
  // ...
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
  // ...
    Val = LHS.getNode()->getOperand(0);
    Start = LHS.getNode()->getOperand(1);

    // Do not extract bits past the most significant bit of the source.
    int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
    if (NumBits > GoodBits) {
      // ...
    }

  // Pattern: (and (srl val, shift), shifted-mask).
  // ...
    NumBits = NumZeros + NumOnes - ShiftAmt;
  // ...
    if (ShiftAmt < NumZeros) {
      // ...
    }
  // ...
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

  // Pattern: (srl/sra (shl val, inner), outer); sra marks the extract signed.
  // ...
    Val = LHS->getOperand(0);
  // ...
    if (OuterShiftAmt < InnerShiftAmt) {
      // ...
    }
    Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                      MVT::i32);
  // ...

  // Pick the signed/unsigned, 32-/64-bit flavour of BFE.
  unsigned Opc;
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned)
      Opc = NVPTX::BFE_S32rii;
    else
      Opc = NVPTX::BFE_U32rii;
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned)
      Opc = NVPTX::BFE_S64rii;
    else
      Opc = NVPTX::BFE_U64rii;
  } else {
    return false;
  }

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(),
                                        {Val, Start, Len}));
  return true;
}
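// Illustration: "(x >> 8) & 0xffff" matches the first pattern and becomes
// "bfe.u32 %r, %r, 8, 16;", i.e. extract 16 bits starting at bit 8.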
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();
  if (VT.getScalarType() != MVT::bf16)
    return false;

  const NVPTXSubtarget *STI = TM.getSubtargetImpl();
  if (STI->hasNativeBF16Support(N->getOpcode()))
    return false;

  const bool IsVec = VT.isVector();
  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto GetConstant = [&](float Value) -> SDValue {
    // BF16 immediates are materialized through register moves; the packed
    // v2bf16 form duplicates the 16-bit pattern into both halves.
    APFloat APF(Value);
    bool LosesInfo;
    APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (IsVec) {
      auto API = APF.bitcastToAPInt();
      API = API.concat(API);
      SDValue Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
      return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
                     0);
    }
    SDValue Const = CurDAG->getTargetConstantFP(APF, DL, VT);
    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
  };

  SmallVector<SDValue, 3> Operands;
  switch (N->getOpcode()) {
  case ISD::FADD:
    Operands = {N0, GetConstant(1.0), N1};
    break;
  case ISD::FSUB:
    Operands = {N1, GetConstant(-1.0), N0};
    break;
  case ISD::FMUL:
    Operands = {N0, N1, GetConstant(-0.0)};
    break;
  default:
    return false;
  }

  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
  MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
  ReplaceNode(N, FMA);
  return true;
}

SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  if (auto *CN = dyn_cast<ConstantSDNode>(V))
    return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
                                     V.getValueType());
  if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
    return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
                                       V.getValueType());
  return V;
}
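// On targets without native bf16 add/sub/mul, the operation is emulated with
// the bf16 FMA that does exist: a + b => fma(a, 1.0, b), a - b =>
// fma(b, -1.0, a) and a * b => fma(a, b, -0.0). Using -0.0 rather than 0.0 as
// the addend keeps the sign of a zero product intact.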
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::ConstraintCode::m: { // Memory operand.
    const auto [Base, Offset] = selectADDR(Op, CurDAG);
    OutOps.push_back(Base);
    OutOps.push_back(Offset);
    return false;
  }
  }
}
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  // CopyToReg of an i128 value: glue the two i64 halves into a single i128
  // through the V2I64toI128 pseudo, then rebuild the CopyToReg on top of it.
  SDLoc DL(N);
  SDNode *Mov = CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128,
                                       {N->getOperand(2), N->getOperand(3)});

  SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
  NewOps[0] = N->getOperand(0); // Chain.
  NewOps[1] = N->getOperand(1); // Destination register.
  NewOps[2] = SDValue(Mov, 0);  // Combined i128 value.
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4); // Optional glue.
  // ... (recreate the CopyToReg with NewOps and replace N)
}

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
  // CopyFromReg of an i128 value: split it back into two i64 results through
  // the I128toV2I64 pseudo.
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Reg = N->getOperand(1);

  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,
      {MVT::i64, MVT::i64, MVT::Other, MVT::Glue},
      // ... (source register, chain and optional glue)
      {Reg, Chain});

  ReplaceNode(N, Mov);
}
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  SDLoc DL(N);
  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
  unsigned int FenceOp =
      getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
                 Scopes[N->getConstantOperandVal(2)], Subtarget);
  SDValue Chain = N->getOperand(0);
  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
  ReplaceNode(N, FenceNode);
  return true;
}
1880 "NVPTXScopes::operator[]");
1882 auto S = Scopes.find(
ID);
1883 if (S == Scopes.end()) {
1884 auto scopeName = Context->getSyncScopeName(
ID);
1885 assert(scopeName.has_value() &&
"Scope name must exist.");
1889 for (
const auto &Entry : Scopes) {
1890 if (
auto name = Context->getSyncScopeName(Entry.first))
1895 formatv(
"NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
1896 "Supported syncscopes are: {2}.",
1897 scopeName.value(),
int(
ID),
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
                                                       bool IsShared32,
                                                       bool IsCacheHint,
                                                       bool IsIm2Col) {
  // Expands GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED for the 1D-5D tile and
  // 3D-5D im2col variants; an unexpected dimensionality is an internal error.
  // ...
}
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
  // Operand layout: {Chain, Intrinsic-ID, src, tensor-map, dims...,
  // cache-hint, cache-hint-flag}.
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);

  SDLoc DL(N);
  SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
  Ops.push_back(getI32Imm(RedOp, DL)); // Reduction operation.
  Ops.push_back(N->getOperand(0));     // Chain.

  const bool IsShared32 =
      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
  const unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
      NumDims, IsShared32, IsCacheHint, IsIm2Col);
  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
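// These helpers service the nvvm_cp_async_bulk_tensor_reduce_* intrinsics,
// which map onto the PTX "cp.reduce.async.bulk.tensor" family: a TMA bulk
// copy from shared to global memory that applies a reduction (add, min, max,
// inc, dec, and, or, xor) on the way out. RedOp selects the reduction and
// IsIm2Col the addressing mode.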
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)

static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
  switch (IID) {
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
    return TCGEN05_ST_OPCODE(16x64b, x1);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
    return TCGEN05_ST_OPCODE(16x64b, x2);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
    return TCGEN05_ST_OPCODE(16x64b, x4);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
    return TCGEN05_ST_OPCODE(16x64b, x8);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
    return TCGEN05_ST_OPCODE(16x64b, x16);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
    return TCGEN05_ST_OPCODE(16x64b, x32);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
    return TCGEN05_ST_OPCODE(16x64b, x64);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
    return TCGEN05_ST_OPCODE(16x64b, x128);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
    return TCGEN05_ST_OPCODE(16x128b, x1);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
    return TCGEN05_ST_OPCODE(16x128b, x2);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
    return TCGEN05_ST_OPCODE(16x128b, x4);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
    return TCGEN05_ST_OPCODE(16x128b, x8);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
    return TCGEN05_ST_OPCODE(16x128b, x16);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
    return TCGEN05_ST_OPCODE(16x128b, x32);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
    return TCGEN05_ST_OPCODE(16x128b, x64);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
    return TCGEN05_ST_OPCODE(16x256b, x1);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
    return TCGEN05_ST_OPCODE(16x256b, x2);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
    return TCGEN05_ST_OPCODE(16x256b, x4);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
    return TCGEN05_ST_OPCODE(16x256b, x8);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
    return TCGEN05_ST_OPCODE(16x256b, x16);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
    return TCGEN05_ST_OPCODE(16x256b, x32);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
    return TCGEN05_ST_OPCODE(16x32bx2, x1);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
    return TCGEN05_ST_OPCODE(16x32bx2, x2);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
    return TCGEN05_ST_OPCODE(16x32bx2, x4);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
    return TCGEN05_ST_OPCODE(16x32bx2, x8);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
    return TCGEN05_ST_OPCODE(16x32bx2, x16);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
    return TCGEN05_ST_OPCODE(16x32bx2, x32);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
    return TCGEN05_ST_OPCODE(16x32bx2, x64);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
    return TCGEN05_ST_OPCODE(16x32bx2, x128);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
    return TCGEN05_ST_OPCODE(32x32b, x1);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
    return TCGEN05_ST_OPCODE(32x32b, x2);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
    return TCGEN05_ST_OPCODE(32x32b, x4);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
    return TCGEN05_ST_OPCODE(32x32b, x8);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
    return TCGEN05_ST_OPCODE(32x32b, x16);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
    return TCGEN05_ST_OPCODE(32x32b, x32);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
    return TCGEN05_ST_OPCODE(32x32b, x64);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
    return TCGEN05_ST_OPCODE(32x32b, x128);
  }
  llvm_unreachable("unhandled tcgen05.st intrinsic");
}
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        "tcgen05.st is not supported on this architecture variant");

  SDLoc DL(N);
  const unsigned IID = N->getConstantOperandVal(1);

  SmallVector<SDValue, 32> Operands = {N->getOperand(2)}; // Tensor address.
  if (hasOffset)
    Operands.push_back(CurDAG->getTargetConstant(N->getConstantOperandVal(3),
                                                 DL, MVT::i32));
  // Source registers; the final operand is the unpack flag.
  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
    Operands.push_back(N->getOperand(I));
  Operands.push_back(N->getOperand(0)); // Chain.

  const bool EnableUnpack =
      N->getConstantOperandVal(N->getNumOperands() - 1) == 1;
  ReplaceNode(N, CurDAG->getMachineNode(getTcgen05StOpcode(IID, EnableUnpack),
                                        DL, N->getVTList(), Operands));
}
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
                                        /*IsIm2Col=*/true);
    return true;

  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
    SelectTcgen05St(N);
    return true;
  }

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, /*hasOffset=*/true);
    return true;
  }
  }
}
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  SDLoc dl(N);
  const auto *AN = cast<MemSDNode>(N);

  const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
  SmallVector<SDValue, 8> Ops;
  // Data operands: the swap value, or the compare and swap values for CAS.
  Ops.append(N->op_begin() + 2, N->op_end());
  Ops.append({Base, Offset,
              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),
              getI32Imm(getAddrSpace(AN), dl),
              N->getOperand(0)}); // Chain.

  const unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
                              ? NVPTX::ATOM_EXCH_B128
                              : NVPTX::ATOM_CAS_B128;
  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(ATOM, AN->memoperands());
  ReplaceNode(N, ATOM);
}
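// The b128 atomics (PTX atom.exch.b128 / atom.cas.b128) operate on a value
// modeled as two i64 halves, which is also why the i128 CopyToReg /
// CopyFromReg cases in Select() go through SelectV2I64toI128 and
// SelectI128toV2I64.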
void NVPTXDAGToDAGISel::selectBR_JT(SDNode *N) {
  SDLoc DL(N);
  // Targets without brx.idx support expand BR_JT during legalization, so it
  // never reaches this point there.

  const SDValue InChain = N->getOperand(0);
  const SDValue Index = N->getOperand(2);
  const auto *JT = cast<JumpTableSDNode>(N->getOperand(1));
  unsigned JId = JT->getIndex();
  MachineJumpTableInfo *MJTI = CurDAG->getMachineFunction().getJumpTableInfo();
  const auto &MBBs = MJTI->getJumpTables()[JId].MBBs;

  // Emit a BRX_START / BRX_ITEM... / BRX_END chain; the printer turns it into
  // the branch-target list plus the indexed branch.
  MachineSDNode *Chain = CurDAG->getMachineNode(
      NVPTX::BRX_START, DL, {MVT::Other, MVT::Glue}, {Index, InChain});
  for (MachineBasicBlock *MBB : ArrayRef(MBBs).drop_back())
    Chain = CurDAG->getMachineNode(
        NVPTX::BRX_ITEM, DL, {MVT::Other, MVT::Glue},
        {CurDAG->getBasicBlock(MBB), SDValue(Chain, 0), SDValue(Chain, 1)});
  MachineSDNode *BrxEnd =
      CurDAG->getMachineNode(NVPTX::BRX_END, DL, MVT::Other,
                             {CurDAG->getBasicBlock(MBBs.back()),
                              SDValue(Chain, 0), SDValue(Chain, 1)});
  ReplaceNode(N, BrxEnd);
}
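// The resulting PTX looks roughly like:
//   $L_brx_0: .branchtargets $L_bb1, $L_bb2, $L_bb3;
//   brx.idx %r_index, $L_brx_0;
// where %r_index and the $L_* labels are illustrative names.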