#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

NVPTX::DivPrecisionLevel
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
  // ...
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
  // ...
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  // ...
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
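
// Select() is the main entry point of the instruction selector: nodes that are
// already machine opcodes are left alone, and everything else is dispatched by
// opcode to the specialized try*/Select* helpers below.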
void NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode()) {
    // ...
  }

  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD:
    // ...
  case ISD::ATOMIC_STORE:
    // ...
  case ISD::ATOMIC_FENCE:
    // ...
  // ... (intervening case labels elided)
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      // ...
    SelectSETP_BF16X2(N);
    // ...
    if (tryLoadVector(N))
      // ...
    if (tryStoreVector(N))
      // ...
    if (tryIntrinsicChain(N))
      // ...
    if (tryIntrinsicVoid(N))
      // ...
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    // ...
  // ... (intervening case labels elided)
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      // ...
    }
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      // ...
    }
    // ...
    selectAtomicSwap128(N);
    // ...
    if (tryBF16ArithToFMA(N))
      // ...
  }
}
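
// getTcgen05LdOpcode maps a tcgen05.ld intrinsic ID (shape and replication
// count) to the corresponding machine opcode; the TCGEN05_LD_OPCODE macro
// picks the _PACK variant when packing is requested.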
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)

static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
  switch (IID) {
  // ... (the return for each case, TCGEN05_LD_OPCODE(<shape>, <count>), is elided)
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
    // ...
  }
}
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  // ...
  if (hasOffset) {
    auto OffsetNode = CurDAG->getTargetConstant(/* ... */);
    // ...
        {N->getOperand(2), OffsetNode, N->getOperand(0)}));
  } else {
    // ...
        {N->getOperand(2), N->getOperand(0)}));
  }
}
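
// tryIntrinsicChain handles ISD::INTRINSIC_W_CHAIN nodes: the ldu.global
// intrinsics go through tryLDU, and the tcgen05.ld family is routed to
// SelectTcgen05Ld (with hasOffset = true for the 16x32bx2 shapes).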
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  // ...
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    // ...
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
    SelectTcgen05Ld(N);
    return true;
  }
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, true);
    return true;
  }
  }
  // ...
}
// ... (helper mapping an ISD condition code to a PTX comparison mode, returned
//      as an i32 target constant)
    return CmpMode::NotANumber;
  // ...
  return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
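
// SelectSETP_F16X2 / SelectSETP_BF16X2 select a single SETP.f16x2 / SETP.bf16x2
// machine instruction comparing two packed half / bfloat16 pairs and producing
// two i1 predicate results; the FTZ modifier comes from useF32FTZ().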
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  // ...
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
  // ...
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  // ...
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
  // ...
}
bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  MVT EltVT = N->getSimpleValueType(0);
  // ...
      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
  // ...
}
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  // ...
  MVT VT = Vector.getSimpleValueType();
  // ...
    Opcode = NVPTX::I32toV2I16;
  // ...
    Opcode = NVPTX::I64toV2I32;
  // ...
  for (auto *U : Vector.getNode()->users()) {
    // ...
    if (U->getOperand(0) != Vector)
      // ...
    if (const ConstantSDNode *IdxConst =
            // ...
      if (IdxConst->getZExtValue() == 0)
        // ...
      else if (IdxConst->getZExtValue() == 1)
        // ...
  }
  // ...
      CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    // ...
  for (auto *Node : E1)
    // ...
}
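
// Note: the user loop above gathers every EXTRACT_VECTOR_ELEMENT use of the
// packed vector into E0 (element 0) and E1 (element 1) so that a single
// I32toV2I16 / I64toV2I32 unpack can replace all of them at once.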
static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
  // ...
}

NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
  return convertAS(N->getMemOperand()->getAddrSpace())
      // ...
}

// ... (getMemOrder / getAtomicScope helpers)
  auto Ordering = N->getMergedOrdering();
  // ...
  return Scopes[N->getSyncScopeID()];
// ...
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
};
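
// getOperationOrderings maps the LLVM atomic ordering of a memory operation to
// the PTX ordering placed on the instruction itself plus the ordering of any
// fence that has to accompany it (e.g. for seq_cst operations), rejecting
// combinations PTX cannot express.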
static OperationOrderings
getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
  // ...
  if (/* ... */ !HasMemoryOrdering) {
    report_fatal_error(
        formatv("PTX does not support \"atomic\" for orderings different than"
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
                /* ... */));
  }
  // ...
  bool AddrGenericOrGlobalOrShared =
      // ...
  if (!AddrGenericOrGlobalOrShared)
    // ...
  bool UseRelaxedMMIO =
      // ...

  // ...
    report_fatal_error(
        formatv("PTX only supports Acquire Ordering on reads: {}",
                N->getOperationName()));
  // ...
    report_fatal_error(
        formatv("PTX only supports Release Ordering on writes: {}",
                N->getOperationName()));
  // ...
    report_fatal_error(
        formatv("NVPTX does not support AcquireRelease Ordering on "
                /* ... */
                "yet and PTX does not support it on loads or stores: {}",
                N->getOperationName()));
  // ...
  else if (N->writeMem())
    // ...
    report_fatal_error(
        formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                "read-modify-writes yet: {}",
                N->getOperationName()));
  return OperationOrderings(InstrOrder,
                            /* ... */);
  // ...
    report_fatal_error(
        formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
                /* ... */));
}
NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
                                                  NVPTX::Ordering O) const {
  auto S = Scopes[N->getSyncScopeID()];
  // ...
    Subtarget->failIfClustersUnsupported("cluster scope");
  // ...
}
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
                               NVPTXSubtarget const *T) {
  // ...
    T->failIfClustersUnsupported(".cluster scope fence");
  if (!T->hasSplitAcquireAndReleaseFences() &&
      // ...

  // Acquire fences:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    // ...
      return NVPTX::atomic_thread_fence_acquire_cluster;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    // ...
      report_fatal_error(
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
                  /* ... */));

  // Release fences:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    // ...
      return NVPTX::atomic_thread_fence_release_cluster;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    // ...
      report_fatal_error(
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
                  /* ... */));

  // AcquireRelease fences:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    // ...
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    // ...
      report_fatal_error(
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
                  /* ... */));

  // SequentiallyConsistent fences:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    // ...
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
    // ...
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;
  // ...
    report_fatal_error(
        formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
                OrderingToString(O), ScopeToString(S)));
}
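
// In short: on subtargets with PTX memory-ordering support the scoped
// atomic_thread_fence_* opcodes are used, otherwise the legacy membar
// instructions (INT_MEMBAR_SYS/CTA/GL); cluster scope has no membar fallback.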
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
                                                MemSDNode *N) {
  auto [InstructionOrdering, FenceOrdering] =
      // ...
  auto Scope = getOperationScope(N, InstructionOrdering);
  // ...
    report_fatal_error(
        formatv("Unexpected fence ordering: \"{}\".",
                /* ... */));
  return {InstructionOrdering, Scope};
}
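
// SelectAddrSpaceCast lowers ISD::ADDRSPACECAST to the PTX cvta / cvta.to
// conversions: casts to the generic space use cvta.<space>, casts out of the
// generic space use cvta.to.<space>, with CVT_u64_u32 / CVT_u32_u64 moves
// inserted when the two address spaces have different pointer widths in
// 64-bit mode.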
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  // ...
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");
  // ...

  // Specific-to-generic:
  if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
    // ...
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
                                         /* ... */);
    // ...
  }
  switch (SrcAddrSpace) {
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
  // ...
    assert(/* ... */ &&
           "Shared cluster address space is only supported in 64-bit mode");
    Opc = NVPTX::cvta_shared_cluster_64;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
  }
  // ...

  // Generic-to-specific:
  if (SrcAddrSpace != 0)
    // ...
  switch (DstAddrSpace) {
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
  // ...
    assert(/* ... */ &&
           "Shared cluster address space is only supported in 64-bit mode");
    Opc = NVPTX::cvta_to_shared_cluster_64;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
  // ...
    Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
  }
  SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
  if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
    // ...
    CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
                                  /* ... */);
  }
  // ...
}
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {
  // ...
    return std::nullopt;
}

static bool isAddLike(const SDValue V) {
  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
}

static SDValue stripAssertAlign(SDValue N) {
  // ...
    N = N.getOperand(0);
  // ...
}

static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
  // ...
                                       GA->getValueType(0), GA->getOffset(),
                                       GA->getTargetFlags());
  // ...
                                        ES->getTargetFlags());
  // ...
}

static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
  APInt AccumulatedOffset(64u, 0);
  // ...
    const APInt CI = CN->getAPIntValue().sext(64);
    if (!(CI + AccumulatedOffset).isSignedIntN(32))
      // ...
    AccumulatedOffset += CI;
  // ...
}
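
// Together these helpers implement selectADDR: an address is split into a base
// (global address, external symbol, frame index, or plain register) plus a
// signed 32-bit immediate offset accumulated from add-like chains, and the
// resulting Base/Offset pair feeds the ld/st operand lists built below.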
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  // ...
  assert(LD->readMem() && "Expected load");
  // ...
  if (PlainLoad && PlainLoad->isIndexed())
    // ...

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
  // ...
         FromTypeWidth <= 128 && "Invalid width for load");
  // ...
          getI32Imm(Scope, DL),
          getI32Imm(CodeAddrSpace, DL),
          getI32Imm(FromType, DL),
          getI32Imm(FromTypeWidth, DL),
          // ...
  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
  // ...
  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  // ...
  MachineMemOperand *MemRef = LD->getMemOperand();
  // ...
}

// ...
  switch (N->getOpcode()) {
    // ...
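
// tryLoadVector handles the NVPTX vector-load nodes (v2/v4/v8 forms), using
// the same fence, ordering/scope/address-space/width immediates and
// Base/Offset operands as tryLoad, but picking an LDV_*_v2/v4/v8 opcode via
// pickOpcodeForVT.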
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  // ...
  const MVT EltVT = LD->getSimpleValueType(0);
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
  // ...
  const unsigned ExtensionType =
      N->getConstantOperandVal(N->getNumOperands() - 1);
  // ...
                             : NVPTX::PTXLdStInstCode::Untyped;
  // ...
          getI32Imm(Scope, DL),
          getI32Imm(CodeAddrSpace, DL),
          getI32Imm(FromType, DL),
          getI32Imm(FromTypeWidth, DL),
          // ...
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  // ...
                             NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
  // ...
                             NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
  // ...
                             NVPTX::LDV_i32_v8, {});
  }
  // ...
  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  MachineMemOperand *MemRef = LD->getMemOperand();
  // ...
}
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  // ...
  unsigned ExtensionType;
  // ...
    ExtensionType = Load->getExtensionType();
  // ...
    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
  // ...
                             : NVPTX::PTXLdStInstCode::Untyped;
  // ...
  assert(!(LD->getSimpleValueType(0).isVector() &&
           // ...
  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
  // ...
                             NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
  // ...
                             NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
  // ...
                             NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
  // ...
                             NVPTX::LD_GLOBAL_NC_v8i32, {});
  }
  // ...
  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  // ...
}

static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem) {
  // ...
  auto ElementBitWidth = TotalWidth / NumElts;
  assert(/* ... */ ElementBitWidth <= 128 && TotalWidth <= 256 &&
         "Invalid width for load");
  return ElementBitWidth;
}
bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
  // ...
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  // ...
                             NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
  // ...
                             NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
  // ...
                             NVPTX::LDU_GLOBAL_v4i32, {});
  }
  // ...
  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  // ...
}
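
// tryStore / tryStoreVector mirror the load path: they insert the required
// memory fence, encode ordering, scope, address space and width as immediate
// operands, and pick the scalar or vector store opcode matching the stored
// value type via pickOpcodeForVT.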
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  // ...
  assert(ST->writeMem() && "Expected store");
  // ...
  assert((PlainStore || AtomicStore) && "Expected store");
  if (PlainStore && PlainStore->isIndexed())
    // ...

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
  // ...
  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
  // ...
         "Invalid width for store");
  // ...
          getI32Imm(Ordering, DL),
          getI32Imm(Scope, DL),
          getI32Imm(CodeAddrSpace, DL),
          getI32Imm(ToTypeWidth, DL),
          // ...
  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(/* ... */,
                      NVPTX::ST_i32, NVPTX::ST_i64);
  // ...
  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  // ...
  MachineMemOperand *MemRef = ST->getMemOperand();
  // ...
}
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  // ...
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
  // ...
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
  // ...
  for (auto &V : ST->ops().slice(1, NumElts))
    Ops.push_back(selectPossiblyImm(V));
  const unsigned ToTypeWidth = TotalWidth / NumElts;
  // ...
         TotalWidth <= 256 && "Invalid width for store");
  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
              // ...
  // ...
      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {
  // ...
  }
  // ...
  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = ST->getMemOperand();
  // ...
}
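
// tryBFE pattern-matches shift/and combinations that extract a contiguous
// bitfield and selects the PTX bfe.{s,u}32 / bfe.{s,u}64 instructions
// (BFE_S32rii etc.), computing the Start/Len immediates from the mask and
// shift amounts.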
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  // ...
  bool IsSigned = false;
  // ...
    uint64_t MaskVal = Mask->getZExtValue();
    // ...
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    // ...
    Val = LHS.getNode()->getOperand(0);
    Start = LHS.getNode()->getOperand(1);
    // ...
    int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
    if (NumBits > GoodBits) {
      // ...
    }
    // ...
    NumBits = NumZeros + NumOnes - ShiftAmt;
    // ...
    if (ShiftAmt < NumZeros) {
      // ...
    }
    // ...
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    // ...
    Val = LHS->getOperand(0);
    // ...
    if (OuterShiftAmt < InnerShiftAmt) {
      // ...
    }
    // ...
    Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                      // ...
  // ...
      Opc = NVPTX::BFE_S32rii;
    // ...
      Opc = NVPTX::BFE_U32rii;
    // ...
      Opc = NVPTX::BFE_S64rii;
    // ...
      Opc = NVPTX::BFE_U64rii;
  // ...
}
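
// tryBF16ArithToFMA rewrites bf16 FADD/FSUB/FMUL as FMA when the target lacks
// native bf16 arithmetic: a + b => fma(a, 1.0, b), a - b => fma(b, -1.0, a),
// a * b => fma(a, b, -0.0), splatting the constant for the bf16x2 vector form.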
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();
  // ...
  const NVPTXSubtarget *STI = TM.getSubtargetImpl();
  // ...

  // GetConstant(Value): materialize Value as a bf16 immediate, or as a
  // splatted bf16x2 immediate for the packed-vector form.
  // ...
    auto API = APF.bitcastToAPInt();
    // ...
      API = API.concat(API);
      // ...
      return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
                     0);
    // ...
    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
  // ...

  switch (N->getOpcode()) {
  // ...
    Operands = {N0, GetConstant(1.0), N1};
    // ...
    Operands = {N1, GetConstant(-1.0), N0};
    // ...
    Operands = {N0, N1, GetConstant(-0.0)};
  }
  // ...
  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
  // ...
}

SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  // ...
    return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
                                     /* ... */);
  // ...
    return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
                                       /* ... */);
  // ...
}
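
// SelectInlineAsmMemoryOperand handles the memory constraints of inline asm by
// selecting the constrained address into a Base + Offset operand pair.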
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
  // ...
    OutOps.push_back(Base);
    OutOps.push_back(Offset);
    // ...
  }
  // ...
}
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  // ...
  NewOps[0] = N->getOperand(0);
  // ...
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);
  // ...
}
void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
  // ...
  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,
      /* ... */);
  // ...
}
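
// tryFence lowers ISD::ATOMIC_FENCE to the membar / atomic_thread_fence
// machine opcode chosen by getFenceOp() from the fence's ordering and
// synchronization scope operands.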
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  // ...
  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
  unsigned int FenceOp =
      getFenceOp(/* ... */,
                 Scopes[N->getConstantOperandVal(2)], Subtarget);
  // ...
  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
  // ...
}

NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
  // ...
                   "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {
    // ...
  }
  // ...
}
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
  [&]() -> auto {                                                              \
    if (is_mc && is_ch)                                                        \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH);      \
    if (is_ch)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH);         \
    if (is_mc)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC);         \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \
  }()

static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
                                                       bool IsShared32,
                                                       bool IsCacheHint,
                                                       bool IsIm2Col) {
  // ... (dim/mode dispatch elided)
                   "GetCpAsyncBulkTensorS2GReductionOpcode.");
  // ...
                   "GetCpAsyncBulkTensorS2GReductionOpcode.");
}

static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
                                              bool IsMultiCast, bool IsCacheHint,
                                              bool IsIm2Col) {
  if (IsIm2Col) {
    switch (Dim) { // 3D/4D/5D im2col variants
                                                 IsCacheHint, IsShared32);
                                                 IsCacheHint, IsShared32);
                                                 IsCacheHint, IsShared32);
      // ...
                       "GetCpAsyncBulkTensorG2SOpcode.");
    }
  } else {
    switch (Dim) { // 1D..5D tile variants
                                                 IsCacheHint, IsShared32);
                                                 IsCacheHint, IsShared32);
                                                 IsCacheHint, IsShared32);
                                                 IsCacheHint, IsShared32);
                                                 IsCacheHint, IsShared32);
      // ...
      llvm_unreachable(
          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
    }
  }
}
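
// These macros simply assemble machine-opcode names from the transfer
// direction (G2S / RED), dimensionality, addressing mode, 32- vs 64-bit shared
// pointers, and the optional multicast / cache-hint suffixes, so the helpers
// above can pick an opcode from a handful of boolean flags.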
static size_t GetDimsFromIntrinsic(unsigned IID) {
  switch (IID) {
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
    return 3;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
    return 4;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
    return 5;
  // ...
  }
}
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
                                                         bool IsIm2Col) {
  // ...
  size_t NumOps = N->getNumOperands();
  // ...
  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
  size_t NumBaseArgs = NumDims + NumOffsets + 3;
  size_t MultiCastIdx = NumBaseArgs + 2;

  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
    report_fatal_error(
        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
                /* ... */));
  // ...
    Ops.push_back(N->getOperand(MultiCastIdx));
  // ...
    Ops.push_back(N->getOperand(MultiCastIdx + 1));
  // ...
  Ops.push_back(getI32Imm(CTAGroupVal, DL));
  // ...
  Ops.push_back(N->getOperand(0));
  // ...
      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
  // ...
}
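
// SelectCpAsyncBulkTensorReduceCommon builds the operand list for the
// cp.async.bulk.tensor reduction intrinsics: the reduction kind (RedOp) is
// encoded as an immediate, followed by the tensor arguments, the optional
// cache hint, and the chain.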
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
  // ...
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);
  // ...
  Ops.push_back(getI32Imm(RedOp, DL));
  Ops.push_back(N->getOperand(0));
  // ...
      NumDims, IsShared32, IsCacheHint, IsIm2Col);
  // ...
}
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)

static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
  switch (IID) {
  // ... (the return for each case, TCGEN05_ST_OPCODE(<shape>, <count>), is elided)
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
    // ...
  }
}
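
// SelectTcgen05St collects the value operands of a tcgen05.st intrinsic
// (skipping the leading chain / intrinsic-id / address operands, and the
// offset operand when hasOffset is set) and emits the matching TCGEN05_ST
// machine opcode.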
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  // ...
  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
    // ...
  // ...
}
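
// tryIntrinsicVoid dispatches ISD::INTRINSIC_VOID nodes: the
// cp.async.bulk.tensor G2S copies and the reduce variants (add/min/max/inc/
// dec/and/or/xor, in tile and im2col forms) go through the Common helpers
// above, and the tcgen05.st family goes through SelectTcgen05St.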
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  // ...
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  switch (IID) {
  // ...
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
    SelectCpAsyncBulkTensorG2SCommon(N);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
    SelectCpAsyncBulkTensorG2SCommon(N, true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
    SelectTcgen05St(N);
    return true;
  }
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, true);
    return true;
  }
  }
  // ...
}
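
// selectAtomicSwap128 lowers the 128-bit atomic exchange / compare-and-swap
// nodes to ATOM_EXCH_B128 / ATOM_CAS_B128, encoding the memory ordering and
// synchronization scope as immediate operands.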
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  // ...
  Ops.append(N->op_begin() + 2, N->op_end());
  // ...
              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),
              // ...
  // ...
                            ? NVPTX::ATOM_EXCH_B128
                            : NVPTX::ATOM_CAS_B128;
  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
  // ...
}