//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

char NVPTXDAGToDAGISel::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
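  // Selecting mul.wide is purely an optimization, so only enable it when
  // optimizing; at -O0 keep instruction selection simple.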
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // (cases NVPTXISD::Tex1DFloatS32 ... — the full list of texture-fetch
  // opcodes, elided in this excerpt)
    if (tryTextureIntrinsic(N))
      return;
    break;
  // (cases NVPTXISD::Suld* ... — the full list of surface-access opcodes,
  // elided in this excerpt)
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
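// (For example, the f16 constant 1.0 — bit pattern 0x3C00 — is materialized
// with a mov into a 16-bit register rather than used as an inline operand.)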
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map an ISD::CondCode value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
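// Ordered-FP and signed-integer condition codes share the plain modes
// (EQ/GT/...), while the unordered-FP codes map to the 'U'-suffixed variants.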
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
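  // The I32toV2I16 pseudo produces both 16-bit halves of the packed register
  // at once (effectively a single mov.b32 {lo, hi}, src in PTX), so every
  // extract of element 0 or 1 can be redirected to one of its two results.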
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
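  // f16/bf16 share the 16-bit opcodes, and the packed 32-bit-wide vectors
  // below share the 32-bit ones, since PTX moves them as untyped b16/b32 data.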
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}

static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign     : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

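  // Try the addressing forms in decreasing order of preference: direct
  // address (avar), symbol+immediate (asi), register+immediate (ari), and
  // finally plain register (areg).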
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign     : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have a ld.v8.b16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    VecType = NVPTX::PTXLdStInstCode::V2;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;
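  // IsLDG only matters for the scalar cases below; the vector LDU nodes select
  // LDU opcodes directly. Plain loads routed here by canLowerToLDG() use LDG.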

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // Vectors of 16-bit types are loaded/stored as multiples of v2x16 elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
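  // (e.g. a scalar i8 load is given the VT list {i16, Other})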
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8x16 is a special case. PTX doesn't have a st.v8.b16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // store them with st.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    VecType = NVPTX::PTXLdStInstCode::V2;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                               NVPTX::STV_i32_v4_avar, std::nullopt,
                               NVPTX::STV_f32_v4_avar, std::nullopt);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
                            NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
                            NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                 NVPTX::STV_i32_v4_ari, std::nullopt,
                                 NVPTX::STV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
                            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  std::optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
  std::optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, std::nullopt,
                             NVPTX::StoreRetvalV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

2157bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2158 SDLoc DL(N);
2159 SDValue Chain = N->getOperand(0);
2160 SDValue Param = N->getOperand(1);
2161 unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2162 SDValue Offset = N->getOperand(2);
2163 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2164 MemSDNode *Mem = cast<MemSDNode>(N);
2165 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2166
2167 // How many elements do we have?
2168 unsigned NumElts = 1;
2169 switch (N->getOpcode()) {
2170 default:
2171 return false;
2175 NumElts = 1;
2176 break;
2178 NumElts = 2;
2179 break;
2181 NumElts = 4;
2182 break;
2183 }
2184
2185 // Build vector of operands
2187 for (unsigned i = 0; i < NumElts; ++i)
2188 Ops.push_back(N->getOperand(i + 3));
2189 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2190 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2191 Ops.push_back(Chain);
2192 Ops.push_back(Glue);
2193
2194 // Determine target opcode
2195 // If we have an i1, use an 8-bit store. The lowering code in
2196 // NVPTXISelLowering will have already emitted an upcast.
2197 std::optional<unsigned> Opcode = 0;
2198 switch (N->getOpcode()) {
2199 default:
2200 switch (NumElts) {
2201 default:
2202 return false;
2203 case 1:
2204 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2205 NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2206 NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2207 NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2208 break;
2209 case 2:
2210 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2211 NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2212 NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2213 NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2214 break;
2215 case 4:
2216 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2217 NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2218 NVPTX::StoreParamV4I32, std::nullopt,
2219 NVPTX::StoreParamV4F32, std::nullopt);
2220 break;
2221 }
2222 if (!Opcode)
2223 return false;
2224 break;
2225 // Special case: if we have a sign-extend/zero-extend node, insert the
2226 // conversion instruction first, and use that as the value operand to
2227 // the selected StoreParam node.
2228 case NVPTXISD::StoreParamU32: {
2229 Opcode = NVPTX::StoreParamI32;
2230 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2231 MVT::i32);
2232 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2233 MVT::i32, Ops[0], CvtNone);
2234 Ops[0] = SDValue(Cvt, 0);
2235 break;
2236 }
2237 case NVPTXISD::StoreParamS32: {
2238 Opcode = NVPTX::StoreParamI32;
2239 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2240 MVT::i32);
2241 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2242 MVT::i32, Ops[0], CvtNone);
2243 Ops[0] = SDValue(Cvt, 0);
2244 break;
2245 }
2246 }
2247
2248 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2249 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2250 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2251 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2252
2253 ReplaceNode(N, Ret);
2254 return true;
2255}
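// Illustrative example (editorial sketch, not from the upstream file): the
// StoreParamU32 special case above widens an i16 argument before storing it,
// so the selected sequence prints as roughly
//   cvt.u32.u16 %r2, %rs1;
//   st.param.b32 [param0+0], %r2;
// CvtNone (NVPTX::PTXCvtMode::NONE) requests the plain conversion mode, with
// no saturation or rounding modifier on the cvt.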
2256
2257bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2258 unsigned Opc = 0;
2259
2260 switch (N->getOpcode()) {
2261 default: return false;
2263 Opc = NVPTX::TEX_1D_F32_S32_RR;
2264 break;
2266 Opc = NVPTX::TEX_1D_F32_F32_RR;
2267 break;
2269 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2270 break;
2272 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2273 break;
2275 Opc = NVPTX::TEX_1D_S32_S32_RR;
2276 break;
2278 Opc = NVPTX::TEX_1D_S32_F32_RR;
2279 break;
2281 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2282 break;
2284 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2285 break;
2287 Opc = NVPTX::TEX_1D_U32_S32_RR;
2288 break;
2290 Opc = NVPTX::TEX_1D_U32_F32_RR;
2291 break;
2293 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2294 break;
2296 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2297 break;
2299 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2300 break;
2302 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2303 break;
2305 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2306 break;
2308 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2309 break;
2311 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2312 break;
2314 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2315 break;
2317 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2318 break;
2320 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2321 break;
2323 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2324 break;
2326 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2327 break;
2329 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2330 break;
2332 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2333 break;
2335 Opc = NVPTX::TEX_2D_F32_S32_RR;
2336 break;
2338 Opc = NVPTX::TEX_2D_F32_F32_RR;
2339 break;
2341 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2342 break;
2344 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2345 break;
2347 Opc = NVPTX::TEX_2D_S32_S32_RR;
2348 break;
2350 Opc = NVPTX::TEX_2D_S32_F32_RR;
2351 break;
2353 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2354 break;
2356 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2357 break;
2359 Opc = NVPTX::TEX_2D_U32_S32_RR;
2360 break;
2362 Opc = NVPTX::TEX_2D_U32_F32_RR;
2363 break;
2365 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2366 break;
2368 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2369 break;
2371 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2372 break;
2374 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2375 break;
2377 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2378 break;
2380 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2381 break;
2383 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2384 break;
2386 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2387 break;
2389 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2390 break;
2392 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2393 break;
2395 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2396 break;
2398 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2399 break;
2401 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2402 break;
2404 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2405 break;
2407 Opc = NVPTX::TEX_3D_F32_S32_RR;
2408 break;
2410 Opc = NVPTX::TEX_3D_F32_F32_RR;
2411 break;
2413 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2414 break;
2416 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2417 break;
2419 Opc = NVPTX::TEX_3D_S32_S32_RR;
2420 break;
2422 Opc = NVPTX::TEX_3D_S32_F32_RR;
2423 break;
2425 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2426 break;
2428 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2429 break;
2431 Opc = NVPTX::TEX_3D_U32_S32_RR;
2432 break;
2434 Opc = NVPTX::TEX_3D_U32_F32_RR;
2435 break;
2437 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2438 break;
2440 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2441 break;
2443 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2444 break;
2446 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2447 break;
2449 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2450 break;
2452 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2453 break;
2455 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2456 break;
2458 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2459 break;
2461 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2462 break;
2464 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2465 break;
2467 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2468 break;
2470 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2471 break;
2473 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2474 break;
2476 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2477 break;
2479 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2480 break;
2482 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2483 break;
2485 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2486 break;
2488 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2489 break;
2491 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2492 break;
2494 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2495 break;
2497 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2498 break;
2500 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2501 break;
2503 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2504 break;
2506 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2507 break;
2509 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2510 break;
2512 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2513 break;
2515 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2516 break;
2518 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2519 break;
2521 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2522 break;
2524 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2525 break;
2527 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2528 break;
2530 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2531 break;
2533 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2534 break;
2536 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2537 break;
2539 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2540 break;
2542 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2543 break;
2545 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2546 break;
2548 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2549 break;
2551 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2552 break;
2554 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2555 break;
2557 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2558 break;
2560 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2561 break;
2563 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2564 break;
2566 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2567 break;
2569 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2570 break;
2572 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2573 break;
2575 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2576 break;
2578 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2579 break;
2581 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2582 break;
2584 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2585 break;
2587 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2588 break;
2590 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2591 break;
2593 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2594 break;
2596 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2597 break;
2599 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2600 break;
2602 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2603 break;
2605 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2606 break;
2608 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2609 break;
2611 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2612 break;
2614 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2615 break;
2617 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2618 break;
2620 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2621 break;
2623 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2624 break;
2626 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2627 break;
2629 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2630 break;
2632 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2633 break;
2635 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2636 break;
2638 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2639 break;
2641 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2642 break;
2644 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2645 break;
2647 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2648 break;
2650 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2651 break;
2653 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2654 break;
2656 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2657 break;
2659 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2660 break;
2662 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2663 break;
2665 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2666 break;
2668 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2669 break;
2671 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2672 break;
2674 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2675 break;
2677 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2678 break;
2680 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2681 break;
2683 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2684 break;
2686 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2687 break;
2689 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2690 break;
2692 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2693 break;
2695 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2696 break;
2698 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2699 break;
2701 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2702 break;
2704 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2705 break;
2707 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2708 break;
2710 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2711 break;
2713 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2714 break;
2716 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2717 break;
2719 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2720 break;
2722 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2723 break;
2725 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2726 break;
2728 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2729 break;
2731 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2732 break;
2734 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2735 break;
2737 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2738 break;
2740 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2741 break;
2743 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2744 break;
2746 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2747 break;
2749 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2750 break;
2752 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2753 break;
2755 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2756 break;
2758 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2759 break;
2761 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2762 break;
2764 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2765 break;
2766 }
2767
2768 // Copy over operands
2769 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
2770 Ops.push_back(N->getOperand(0)); // Move chain to the back.
2771
2772 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2773 return true;
2774}
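// Illustrative example (editorial sketch, not from the upstream file): one
// row of the table above. A call to llvm.nvvm.tex.unified.2d.v4f32.f32
// arrives here as the corresponding NVPTXISD texture node and is matched to
// NVPTX::TEX_UNIFIED_2D_F32_F32_R, printed as roughly
//   tex.2d.v4.f32.f32 {%f1, %f2, %f3, %f4}, [%rd1, {%f5, %f6}];
// The operand copy above drops the chain from the front of the operand list
// and re-appends it at the back, where the machine node expects it.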
2775
2776bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2777 unsigned Opc = 0;
2778 switch (N->getOpcode()) {
2779 default: return false;
2781 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2782 break;
2784 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2785 break;
2787 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2788 break;
2790 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2791 break;
2793 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2794 break;
2796 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2797 break;
2799 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2800 break;
2802 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2803 break;
2805 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2806 break;
2808 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2809 break;
2811 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2812 break;
2814 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2815 break;
2817 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2818 break;
2820 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2821 break;
2823 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2824 break;
2826 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2827 break;
2829 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2830 break;
2832 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2833 break;
2835 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2836 break;
2838 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2839 break;
2841 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2842 break;
2844 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2845 break;
2847 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2848 break;
2850 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2851 break;
2853 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2854 break;
2856 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2857 break;
2859 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2860 break;
2862 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2863 break;
2865 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2866 break;
2868 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2869 break;
2871 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2872 break;
2874 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2875 break;
2877 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2878 break;
2880 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2881 break;
2883 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2884 break;
2886 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2887 break;
2889 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2890 break;
2892 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2893 break;
2895 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2896 break;
2898 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2899 break;
2901 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2902 break;
2904 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2905 break;
2907 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2908 break;
2910 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2911 break;
2913 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
2914 break;
2916 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
2917 break;
2919 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
2920 break;
2922 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
2923 break;
2925 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
2926 break;
2928 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
2929 break;
2931 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
2932 break;
2934 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
2935 break;
2937 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
2938 break;
2940 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
2941 break;
2943 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
2944 break;
2946 Opc = NVPTX::SULD_1D_I8_TRAP_R;
2947 break;
2949 Opc = NVPTX::SULD_1D_I16_TRAP_R;
2950 break;
2952 Opc = NVPTX::SULD_1D_I32_TRAP_R;
2953 break;
2955 Opc = NVPTX::SULD_1D_I64_TRAP_R;
2956 break;
2958 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
2959 break;
2961 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
2962 break;
2964 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
2965 break;
2967 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
2968 break;
2970 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
2971 break;
2973 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
2974 break;
2976 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
2977 break;
2979 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
2980 break;
2982 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
2983 break;
2985 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
2986 break;
2988 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
2989 break;
2991 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
2992 break;
2994 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
2995 break;
2997 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
2998 break;
3000 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3001 break;
3003 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3004 break;
3006 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3007 break;
3009 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3010 break;
3012 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3013 break;
3015 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3016 break;
3018 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3019 break;
3021 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3022 break;
3024 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3025 break;
3027 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3028 break;
3030 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3031 break;
3033 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3034 break;
3036 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3037 break;
3039 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3040 break;
3042 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3043 break;
3045 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3046 break;
3048 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3049 break;
3051 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3052 break;
3054 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3055 break;
3057 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3058 break;
3060 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3061 break;
3063 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3064 break;
3066 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3067 break;
3069 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3070 break;
3072 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3073 break;
3075 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3076 break;
3078 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3079 break;
3081 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3082 break;
3084 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3085 break;
3087 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3088 break;
3090 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3091 break;
3093 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3094 break;
3096 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3097 break;
3099 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3100 break;
3102 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3103 break;
3105 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3106 break;
3108 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3109 break;
3111 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3112 break;
3114 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3115 break;
3117 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3118 break;
3120 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3121 break;
3123 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3124 break;
3126 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3127 break;
3129 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3130 break;
3132 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3133 break;
3135 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3136 break;
3138 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3139 break;
3141 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3142 break;
3144 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3145 break;
3147 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3148 break;
3150 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3151 break;
3153 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3154 break;
3156 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3157 break;
3159 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3160 break;
3162 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3163 break;
3165 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3166 break;
3168 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3169 break;
3171 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3172 break;
3174 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3175 break;
3177 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3178 break;
3180 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3181 break;
3183 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3184 break;
3186 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3187 break;
3189 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3190 break;
3192 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3193 break;
3195 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3196 break;
3198 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3199 break;
3201 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3202 break;
3204 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3205 break;
3207 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3208 break;
3210 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3211 break;
3213 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3214 break;
3216 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3217 break;
3219 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3220 break;
3222 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3223 break;
3225 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3226 break;
3228 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3229 break;
3231 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3232 break;
3234 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3235 break;
3237 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3238 break;
3240 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3241 break;
3243 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3244 break;
3246 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3247 break;
3249 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3250 break;
3252 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3253 break;
3255 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3256 break;
3258 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3259 break;
3261 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3262 break;
3264 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3265 break;
3267 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3268 break;
3270 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3271 break;
3273 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3274 break;
3275 }
3276
3277 // Copy over operands
3278 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3279 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3280
3281 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3282 return true;
3283}
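// Illustrative example (editorial sketch, not from the upstream file): a call
// to llvm.nvvm.suld.1d.i32.clamp reaches this function as a surface-load node
// and maps to NVPTX::SULD_1D_I32_CLAMP_R, printed as roughly
//   suld.b.1d.b32.clamp {%r1}, [%rd1, {%r2}];
// an untyped 32-bit surface load with clamp addressing. The _TRAP and _ZERO
// opcode families selected above differ only in out-of-range handling.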
3284
3285
3286 /// tryBFE - Look for instruction sequences that can be made more efficient
3287 /// by using the 'bfe' (bit-field extract) PTX instruction.
3288bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3289 SDLoc DL(N);
3290 SDValue LHS = N->getOperand(0);
3291 SDValue RHS = N->getOperand(1);
3292 SDValue Len;
3293 SDValue Start;
3294 SDValue Val;
3295 bool IsSigned = false;
3296
3297 if (N->getOpcode() == ISD::AND) {
3298 // Canonicalize the operands
3299 // We want 'and %val, %mask'
3300 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3301 std::swap(LHS, RHS);
3302 }
3303
3304 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3305 if (!Mask) {
3306 // We need a constant mask on the RHS of the AND
3307 return false;
3308 }
3309
3310 // Extract the mask bits
3311 uint64_t MaskVal = Mask->getZExtValue();
3312 if (!isMask_64(MaskVal)) {
3313 // We *could* handle shifted masks here, but doing so would require an
3314 // 'and' operation to fix up the low-order bits, so we would trade a
3315 // shr+and pair for bfe+and, which has the same throughput.
3316 return false;
3317 }
3318
3319 // How many bits are in our mask?
3320 int64_t NumBits = countr_one(MaskVal);
3321 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3322
3323 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3324 // We have a 'srl/and' pair, extract the effective start bit and length
3325 Val = LHS.getNode()->getOperand(0);
3326 Start = LHS.getNode()->getOperand(1);
3327 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3328 if (StartConst) {
3329 uint64_t StartVal = StartConst->getZExtValue();
3330 // How many "good" bits do we have left? "good" is defined here as bits
3331 // that exist in the original value, not shifted in.
3332 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3333 if (NumBits > GoodBits) {
3334 // Do not handle the case where bits have been shifted in. In theory
3335 // we could handle this, but the cost is likely higher than just
3336 // emitting the srl/and pair.
3337 return false;
3338 }
3339 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3340 } else {
3341 // Do not handle the case where the shift amount (which can be zero if
3342 // no srl was found) is not constant. We could handle this case, but it
3343 // would require run-time logic that would be more expensive than just
3344 // emitting the srl/and pair.
3345 return false;
3346 }
3347 } else {
3348 // Do not handle the case where the LHS of the and is not a shift. While
3349 // it would be trivial to handle this case, it would just transform
3350 // 'and' -> 'bfe', but 'and' has higher throughput.
3351 return false;
3352 }
3353 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3354 if (LHS->getOpcode() == ISD::AND) {
3355 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3356 if (!ShiftCnst) {
3357 // Shift amount must be constant
3358 return false;
3359 }
3360
3361 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3362
3363 SDValue AndLHS = LHS->getOperand(0);
3364 SDValue AndRHS = LHS->getOperand(1);
3365
3366 // Canonicalize the AND to have the mask on the RHS
3367 if (isa<ConstantSDNode>(AndLHS)) {
3368 std::swap(AndLHS, AndRHS);
3369 }
3370
3371 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3372 if (!MaskCnst) {
3373 // Mask must be constant
3374 return false;
3375 }
3376
3377 uint64_t MaskVal = MaskCnst->getZExtValue();
3378 uint64_t NumZeros;
3379 uint64_t NumBits;
3380 if (isMask_64(MaskVal)) {
3381 NumZeros = 0;
3382 // The number of bits in the result bitfield will be the number of
3383 // trailing ones (the AND) minus the number of bits we shift off
3384 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3385 } else if (isShiftedMask_64(MaskVal)) {
3386 NumZeros = llvm::countr_zero(MaskVal);
3387 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3388 // The number of bits in the result bitfield will be the number of
3389 // trailing zeros plus the number of set bits in the mask minus the
3390 // number of bits we shift off
3391 NumBits = NumZeros + NumOnes - ShiftAmt;
3392 } else {
3393 // This is not a mask we can handle
3394 return false;
3395 }
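// Editorial worked example (not in the upstream file): for
//   (srl (and %val, 0x0FF0), 4)
// the mask is a shifted mask with NumZeros = 4 and NumOnes = 8, so
// NumBits = 4 + 8 - 4 = 8; with Start = ShiftAmt = 4 the result is
//   bfe.u32 %r1, %val, 4, 8;
// which extracts bits [11:4], exactly (%val & 0x0FF0) >> 4.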
3396
3397 if (ShiftAmt < NumZeros) {
3398 // Handling this case would require extra logic that would make this
3399 // transformation non-profitable
3400 return false;
3401 }
3402
3403 Val = AndLHS;
3404 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3405 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3406 } else if (LHS->getOpcode() == ISD::SHL) {
3407 // Here, we have a pattern like:
3408 //
3409 // (sra (shl val, NN), MM)
3410 // or
3411 // (srl (shl val, NN), MM)
3412 //
3413 // If MM >= NN, we can efficiently optimize this with bfe
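// Editorial worked example (not in the upstream file): for i32
//   (srl (shl %val, 8), 24)
// we get Start = 24 - 8 = 16 and Len = 32 - 24 = 8, i.e.
//   bfe.u32 %r1, %val, 16, 8;
// With sra instead of srl, the signed form bfe.s32 sign-extends the
// extracted field, just as the shift pair would.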
3414 Val = LHS->getOperand(0);
3415
3416 SDValue ShlRHS = LHS->getOperand(1);
3417 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3418 if (!ShlCnst) {
3419 // Shift amount must be constant
3420 return false;
3421 }
3422 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3423
3424 SDValue ShrRHS = RHS;
3425 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3426 if (!ShrCnst) {
3427 // Shift amount must be constant
3428 return false;
3429 }
3430 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3431
3432 // To avoid extra codegen and be profitable, we need Outer >= Inner
3433 if (OuterShiftAmt < InnerShiftAmt) {
3434 return false;
3435 }
3436
3437 // If the outer shift is more than the type size, we have no bitfield to
3438 // extract (since we also check that the inner shift is <= the outer
3439 // shift, this also implies that the inner shift is < the type size).
3440 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3441 return false;
3442 }
3443
3444 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3445 MVT::i32);
3446 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3447 DL, MVT::i32);
3448
3449 if (N->getOpcode() == ISD::SRA) {
3450 // If we have an arithmetic right shift, we need to use the signed bfe
3451 // variant.
3452 IsSigned = true;
3453 }
3454 } else {
3455 // No can do...
3456 return false;
3457 }
3458 } else {
3459 // No can do...
3460 return false;
3461 }
3462
3463
3464 unsigned Opc;
3465 // For the BFE operations we form here from "and" and "srl", always use the
3466 // unsigned variants.
3467 if (Val.getValueType() == MVT::i32) {
3468 if (IsSigned) {
3469 Opc = NVPTX::BFE_S32rii;
3470 } else {
3471 Opc = NVPTX::BFE_U32rii;
3472 }
3473 } else if (Val.getValueType() == MVT::i64) {
3474 if (IsSigned) {
3475 Opc = NVPTX::BFE_S64rii;
3476 } else {
3477 Opc = NVPTX::BFE_U64rii;
3478 }
3479 } else {
3480 // We cannot handle this type
3481 return false;
3482 }
3483
3484 SDValue Ops[] = {
3485 Val, Start, Len
3486 };
3487
3488 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3489 return true;
3490}
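// Editorial sketch (not in the upstream file): a scalar reference model of
// the unsigned bfe semantics relied on above, assuming a 32-bit 'unsigned'
// and Pos/Len already validated to lie within the value width, as the checks
// in tryBFE guarantee before the node is formed.
static inline unsigned ReferenceBFE32(unsigned Val, unsigned Pos,
                                      unsigned Len) {
  // bfe.u32 d, val, pos, len: take Len bits starting at bit Pos and
  // zero-extend them into the result; a zero-length field yields 0.
  if (Len == 0)
    return 0;
  unsigned Mask = (Len >= 32) ? ~0u : ((1u << Len) - 1u);
  return (Val >> Pos) & Mask;
}
// Example: ReferenceBFE32(0x12345678, 8, 12) == 0x456.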
3491
3492// SelectDirectAddr - Match a direct address for DAG.
3493// A direct address could be a globaladdress or externalsymbol.
3494bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3495 // Return true if TGA or ES.
3496 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3497 N.getOpcode() == ISD::TargetExternalSymbol) {
3498 Address = N;
3499 return true;
3500 }
3501 if (N.getOpcode() == NVPTXISD::Wrapper) {
3502 Address = N.getOperand(0);
3503 return true;
3504 }
3505 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3506 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3507 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3508 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3509 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3510 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3511 }
3512 return false;
3513}
3514
3515// symbol+offset
3516bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3517 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3518 if (Addr.getOpcode() == ISD::ADD) {
3519 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3520 SDValue base = Addr.getOperand(0);
3521 if (SelectDirectAddr(base, Base)) {
3522 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3523 mvt);
3524 return true;
3525 }
3526 }
3527 }
3528 return false;
3529}
3530
3531// symbol+offset
3532bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3533 SDValue &Base, SDValue &Offset) {
3534 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3535}
3536
3537// symbol+offset
3538bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3539 SDValue &Base, SDValue &Offset) {
3540 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3541}
3542
3543// register+offset
3544bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3545 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3546 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3547 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3548 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3549 return true;
3550 }
3551 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3552 Addr.getOpcode() == ISD::TargetGlobalAddress)
3553 return false; // direct calls.
3554
3555 if (Addr.getOpcode() == ISD::ADD) {
3556 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3557 return false;
3558 }
3559 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3560 if (FrameIndexSDNode *FIN =
3561 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3562 // Constant offset from frame ref.
3563 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3564 else
3565 Base = Addr.getOperand(0);
3566 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3567 mvt);
3568 return true;
3569 }
3570 }
3571 return false;
3572}
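// Illustrative matches (editorial sketch, not from the upstream file): an
// (add %ptr, 16) address yields Base = %ptr and Offset = 16, so a load
// selected against this pattern prints as roughly
//   ld.global.u32 %r1, [%rd1+16];
// A frame-index base is rewritten to a target frame index with the same
// constant offset, while symbol+constant addresses are rejected here and
// left to the SelectADDRsi forms above.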
3573
3574// register+offset
3575bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3576 SDValue &Base, SDValue &Offset) {
3577 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3578}
3579
3580// register+offset
3581bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3582 SDValue &Base, SDValue &Offset) {
3583 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3584}
3585
3586bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3587 unsigned int spN) const {
3588 const Value *Src = nullptr;
3589 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3590 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3591 return true;
3592 Src = mN->getMemOperand()->getValue();
3593 }
3594 if (!Src)
3595 return false;
3596 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3597 return (PT->getAddressSpace() == spN);
3598 return false;
3599}
3600
3601/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3602/// inline asm expressions.
3603 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3604 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3605 std::vector<SDValue> &OutOps) {
3606 SDValue Op0, Op1;
3607 switch (ConstraintID) {
3608 default:
3609 return true;
3610 case InlineAsm::ConstraintCode::m: // memory
3611 if (SelectDirectAddr(Op, Op0)) {
3612 OutOps.push_back(Op0);
3613 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3614 return false;
3615 }
3616 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3617 OutOps.push_back(Op0);
3618 OutOps.push_back(Op1);
3619 return false;
3620 }
3621 break;
3622 }
3623 return true;
3624}
3625
3626/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3627/// conversion from \p SrcTy to \p DestTy.
3628unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3629 LoadSDNode *LdNode) {
3630 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3631 switch (SrcTy.SimpleTy) {
3632 default:
3633 llvm_unreachable("Unhandled source type");
3634 case MVT::i8:
3635 switch (DestTy.SimpleTy) {
3636 default:
3637 llvm_unreachable("Unhandled dest type");
3638 case MVT::i16:
3639 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3640 case MVT::i32:
3641 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3642 case MVT::i64:
3643 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3644 }
3645 case MVT::i16:
3646 switch (DestTy.SimpleTy) {
3647 default:
3648 llvm_unreachable("Unhandled dest type");
3649 case MVT::i8:
3650 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3651 case MVT::i32:
3652 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3653 case MVT::i64:
3654 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3655 }
3656 case MVT::i32:
3657 switch (DestTy.SimpleTy) {
3658 default:
3659 llvm_unreachable("Unhandled dest type");
3660 case MVT::i8:
3661 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3662 case MVT::i16:
3663 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3664 case MVT::i64:
3665 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3666 }
3667 case MVT::i64:
3668 switch (DestTy.SimpleTy) {
3669 default:
3670 llvm_unreachable("Unhandled dest type");
3671 case MVT::i8:
3672 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3673 case MVT::i16:
3674 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3675 case MVT::i32:
3676 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3677 }
3678 case MVT::f16:
3679 switch (DestTy.SimpleTy) {
3680 default:
3681 llvm_unreachable("Unhandled dest type");
3682 case MVT::f32:
3683 return NVPTX::CVT_f32_f16;
3684 case MVT::f64:
3685 return NVPTX::CVT_f64_f16;
3686 }
3687 }
3688}
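// Illustrative example (editorial sketch, not from the upstream file): for a
// sign-extending i8 load (ISD::SEXTLOAD) whose result is widened to i32,
// GetConvertOpcode returns NVPTX::CVT_s32_s8, printed as 'cvt.s32.s8'; the
// same widening from a zero- or non-extending load uses NVPTX::CVT_u32_u8.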