#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-lower"

    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),

    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),

    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),

    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
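// For illustration only: a minimal sketch of how the last flag above would
// look as a complete llvm::cl declaration. The variable name and the default
// value are assumptions for the sketch, not taken from this file.
static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false)); // assumed default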
    Offsets->push_back(StartingOffset + 0);
    Offsets->push_back(StartingOffset + 8);

  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    for (auto *EI : STy->elements()) {
                         StartingOffset + SL->getElementOffset(ElementNum));

  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
        (NumElts % 4 == 0 || NumElts == 3)) {
      NumElts = (NumElts + 3) / 4;
      for (unsigned j = 0; j != NumElts; ++j) {
        Offsets->push_back(Off);

        "Promotion is not suitable for scalars of size larger than 64-bits");
      *PromotedVT = MVT::i1;
      *PromotedVT = MVT::i8;
      *PromotedVT = MVT::i16;
      *PromotedVT = MVT::i32;
      *PromotedVT = MVT::i64;
  return EVT(*PromotedVT) != VT;
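// A minimal stand-alone sketch of the size bucketing performed by the switch
// above, using plain integers (the helper name is an assumption; the cases
// mirror the MVTs selected above):
static unsigned promotedScalarBits(unsigned Bits) {
  assert(Bits <= 64 && "scalars wider than 64 bits are not promoted here");
  if (Bits == 1)
    return 1;  // MVT::i1
  if (Bits <= 8)
    return 8;  // MVT::i8
  if (Bits <= 16)
    return 16; // MVT::i16
  if (Bits <= 32)
    return 32; // MVT::i32
  return 64;   // MVT::i64
}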
  if (ParamAlignment < AccessSize)
  if (Offsets[Idx] & (AccessSize - 1))
  EVT EltVT = ValueVTs[Idx];
  if (EltSize >= AccessSize)
  unsigned NumElts = AccessSize / EltSize;
  if (AccessSize != EltSize * NumElts)
  if (Idx + NumElts > ValueVTs.size())
  if (NumElts != 4 && NumElts != 2)
  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    if (ValueVTs[j] != EltVT)
    if (Offsets[j] - Offsets[j - 1] != EltSize)
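// A minimal stand-alone sketch of the eligibility test above, with plain
// scalar types (the helper name and parameter types are assumptions): the
// first lane's offset must be AccessSize-aligned and each following lane must
// sit exactly one element size after its predecessor.
static bool lanesAreContiguous(const uint64_t *Offsets, unsigned Idx,
                               unsigned NumElts, unsigned EltSize,
                               unsigned AccessSize) {
  if (Offsets[Idx] & (AccessSize - 1))
    return false;
  for (unsigned J = Idx + 1; J < Idx + NumElts; ++J)
    if (Offsets[J] - Offsets[J - 1] != EltSize)
      return false;
  return true;
}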
                     Align ParamAlignment, bool IsVAArg = false) {
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    for (unsigned AccessSize : {16, 8, 4, 2}) {
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
        assert(I + 1 < E && "Not enough elements.");
        assert(I + 3 < E && "Not enough elements.");
        Op, VT, IsOpSupported ? Action : NoBF16Action);
    bool IsOpSupported = false;

  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
                 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
                 MVT::i32, MVT::i64}) {

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {

  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
  for (const auto &VT : {MVT::f16, MVT::v2f16})

  for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
  for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
  for (const auto &Op :
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
  setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
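// A minimal sketch of the selection pattern used above (the helper name is an
// assumption): an operation is marked Legal only when the subtarget feature is
// present, otherwise it falls back to the supplied action (e.g. Expand or
// Promote), mirroring the GetMinMaxAction helper referenced above.
static TargetLoweringBase::LegalizeAction
actionIfAvailable(bool HasFeature, TargetLoweringBase::LegalizeAction Fallback) {
  return HasFeature ? TargetLoweringBase::Legal : Fallback;
}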
#define MAKE_CASE(V)                                                           \

                                           bool Reciprocal) const {
  if (Reciprocal || ExtraSteps > 0) {
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
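// The f64 branch above composes rcp.approx.ftz.d with rsqrt.approx.d, i.e. it
// relies on sqrt(x) == 1 / rsqrt(x). A minimal scalar sketch of that identity,
// purely illustrative (std::sqrt from <cmath> stands in for the hardware
// approximations; the helper name is an assumption):
static double sqrtViaRsqrt(double X) {
  double Rsqrt = 1.0 / std::sqrt(X); // plays the role of rsqrt.approx.d
  return 1.0 / Rsqrt;                // plays the role of rcp.approx.ftz.d
}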
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  assert(isABI && "Non-ABI compilation is not supported");

  std::string Prototype;
  O << "prototype_" << UniqueCallSite << " : .callprototype ";

    if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
      size = ITy->getBitWidth();
             "Floating point type expected here");
      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";

  unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
  for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!Outs[OIdx].Flags.isByVal()) {
        O << ".param .align " << ParamAlign.value() << " .b8 ";
        O << "[" << DL.getTypeAllocSize(Ty) << "]";
        if (unsigned len = vtparts.size())
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      O << ".param .b" << sz << " ";

    Type *ETy = Args[i].IndirectType;
    Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
    Align ParamByValAlign =
    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
    O << (first ? "" : ",") << " .param .align " << VAInfo->second
    return DL.getABITypeAlign(Ty);

  if (!DirectCallee) {
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
        return StackAlign.value();
  return DL.getABITypeAlign(Ty);

  switch (ElementType.getSimpleVT().SimpleTy) {
    ElementType = MVT::i16;
    ElementType = MVT::i32;
    ElementType = MVT::i64;
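// For a concrete feel for the stream assembled in getPrototype above: a call
// that returns an i32 and passes an i32 plus a 16-byte aggregate aligned to
// 8 bytes would emit a prototype string roughly of this shape (the numbers
// are made up for the example, not taken from a real compilation):
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[16]);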
                                     unsigned ArgID, const SDLoc &dl) {
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {

  EVT MergedType = ElementType;
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
  if (ElementType != MergedType)

        "Support for variadic functions (unsized array parameter) introduced "
        "in PTX ISA version 6.0 and requires target sm_30.");
  assert(isABI && "Non-ABI compilation is not supported");

  unsigned VAOffset = 0;
  unsigned ParamCount = 0;

  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    EVT VT = Outs[OIdx].VT;
    Type *Ty = Args[i].Ty;
    bool IsByVal = Outs[OIdx].Flags.isByVal();

    assert((!IsByVal || Args[i].IndirectType) &&
           "byval arg must have indirect type");
    Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);

      Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
        VAOffset = alignTo(VAOffset, ArgAlign);
      ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);

        (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));

    if (ParamCount == FirstVAArg) {
                          DeclareParamVTs, DeclareParamOps);
      NeedAlign = PassAsArray;
    } else if (PassAsArray) {
      SDValue DeclareScalarParamOps[] = {
                          DeclareScalarParamOps);

    bool ExtendIntegerParam =

    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
      int CurOffset = Offsets[j];
      SDValue StVal = OutVals[OIdx];
        EltVT = EVT(PromotedVT);
        StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
      } else if (ExtendIntegerParam) {
        assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
                            dl, MVT::i32, StVal);

      if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
        assert(StoreOperands.empty() && "Unfinished preceding store.");
            DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
            StVal, InGlue, ParamCount, dl);
        assert(StoreOperands.empty() && "Unfinished preceding store.");
            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),

        unsigned NumElts = StoreOperands.size() - 3;
        EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
            Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
        StoreOperands.clear();

        if (!IsByVal && IsVAArg) {
                 "Vectorization is expected to be disabled for variadics.");
          VAOffset += DL.getTypeAllocSize(

    assert(StoreOperands.empty() && "Unfinished parameter store.");
    if (!IsByVal && VTs.size() > 0)
    if (IsByVal && IsVAArg)

  if (Ins.size() > 0) {
    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
      retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
      assert(retAlignment && "retAlignment is guaranteed to be set");
          Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
                        VADeclareParam->getVTList(), DeclareParamOps);
  if (isa<ExternalSymbolSDNode>(Callee)) {
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");

        DL, RetTy, Args, Outs, retAlignment,
            ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
        *CB, UniqueCallSite);

      Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);

  SDValue CallVoidOps[] = { Chain, Callee, InGlue };
  SDValue CallArgBeginOps[] = { Chain, InGlue };
  for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
  SDValue CallArgEndOps[] = { Chain,
      Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
  if (Ins.size() > 0) {
    assert(VTs.size() == Ins.size() && "Bad value decomposition");
    Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);

    bool ExtendIntegerRetVal =
        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
      bool needTruncate = false;
      EVT TheLoadType = VTs[i];
      EVT EltType = Ins[i].VT;
        TheLoadType = EVT(PromotedVT);
        EltType = EVT(PromotedVT);
        needTruncate = true;
      if (ExtendIntegerRetVal) {
        TheLoadType = MVT::i32;
        needTruncate = true;
        if (VTs[i].isInteger())
          needTruncate = true;

          EltAlign < DL.getABITypeAlign(
        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
            DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
        ProxyRegTruncates.push_back(std::optional<MVT>());

        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
        unsigned NumElts = LoadVTs.size();
            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
        for (unsigned j = 0; j < NumElts; ++j) {
            ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
            ProxyRegTruncates.push_back(std::optional<MVT>());
        InGlue = RetVal.getValue(NumElts + 1);

      DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);

  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
    if (i < RetElts.size() && RetElts[i]) {
        DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
        {Chain, ProxyRegOps[i], InGlue}
    Chain = Ret.getValue(1);
    InGlue = Ret.getValue(2);
    if (ProxyRegTruncates[i]) {

  for (SDValue &T : TempProxyRegOps) {
        DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
        {Chain, T.getOperand(0), InGlue});
        "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
        "requires target sm_52.",

  uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT RetTypes[] = {ValueSizeTy, MVT::Other};

  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i = 0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    for (unsigned j = 0; j < NumSubElem; ++j) {

  EVT VT = Op->getValueType(0);
  if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
    return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
           isa<ConstantFPSDNode>(Operand);
  if (VT == MVT::v4i8) {

    EVT VT = Op->getValueType(0);
      return APInt(32, 0);
    if (VT == MVT::v2f16 || VT == MVT::v2bf16)
      Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
    else if (VT == MVT::v2i16 || VT == MVT::v4i8)
    if (VT == MVT::v4i8)
    return Value.zext(32);

    Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
  } else if (VT == MVT::v4i8) {
    Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
            GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
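// A minimal stand-alone sketch of the lane packing done above with APInt,
// written with plain fixed-width integers (the helper names are assumptions):
// two 16-bit lanes or four 8-bit lanes are folded into one 32-bit immediate.
static uint32_t packV2x16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}
static uint32_t packV4i8(uint8_t B0, uint8_t B1, uint8_t B2, uint8_t B3) {
  return uint32_t(B0) | (uint32_t(B1) << 8) | (uint32_t(B2) << 16) |
         (uint32_t(B3) << 24);
}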
  if (VectorVT == MVT::v4i8) {
  if (isa<ConstantSDNode>(Index.getNode()))

  if (VectorVT != MVT::v4i8)
  if (Value->isUndef())

  if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
    if (I.value() != -1)
      Selector |= (I.value() << (I.index() * 4));
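// A minimal stand-alone sketch of how the byte selector above is assembled:
// each of the four result bytes gets a 4-bit source index, packed into the
// low 16 bits of a 32-bit word. The helper name is an assumption; -1 marks an
// unused lane, as in the shuffle mask above.
static uint32_t buildByteSelector(const int Mask[4]) {
  uint32_t Selector = 0;
  for (unsigned I = 0; I < 4; ++I)
    if (Mask[I] != -1)
      Selector |= uint32_t(Mask[I]) << (I * 4);
  return Selector;
}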
  EVT VT = Op.getValueType();
  EVT VT = Op.getValueType();

  EVT VT = Op.getValueType();
    return LowerFROUND32(Op, DAG);
    return LowerFROUND64(Op, DAG);

  EVT VT = Op.getValueType();
  const int SignBitMask = 0x80000000;
  const int PointFiveInBits = 0x3F000000;
  SDValue PointFiveWithSignRaw =
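// A minimal scalar sketch of the bit trick set up above: OR the sign bit of x
// into the bit pattern of 0.5f (0x3F000000) to obtain +/-0.5 carrying x's
// sign, then add and truncate to round half away from zero. The real lowering
// also special-cases |x| large enough that adding 0.5 would be lossy; that
// part is omitted here. (Requires <cmath>, <cstring>, <cstdint>; the helper
// name is an assumption.)
static float roundHalfAwayFromZeroSketch(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  uint32_t SignedHalfBits = 0x3F000000u | (Bits & 0x80000000u);
  float SignedHalf;
  std::memcpy(&SignedHalf, &SignedHalfBits, sizeof(SignedHalf));
  return std::trunc(X + SignedHalf);
}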
  EVT VT = Op.getValueType();

  if (Op.getValueType() == MVT::bf16) {
        DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),

  if (Op.getOperand(0).getValueType() == MVT::bf16) {
        Op.getOpcode(), Loc, Op.getValueType(),

  EVT NarrowVT = Op.getValueType();
  EVT WideVT = Op.getValueType();

  if (Op.getValueType() != MVT::v2i16)
  EVT EltVT = Op.getValueType().getVectorElementType();
  for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
        [&](const SDUse &O) {
          return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, O.get(),
                             DAG.getIntPtrConstant(I, DL));
  switch (Op.getOpcode()) {
    return LowerBUILD_VECTOR(Op, DAG);
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
    return LowerINSERT_VECTOR_ELT(Op, DAG);
    return LowerVECTOR_SHUFFLE(Op, DAG);
    return LowerCONCAT_VECTORS(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerLOAD(Op, DAG);
    return LowerShiftLeftParts(Op, DAG);
    return LowerShiftRightParts(Op, DAG);
    return LowerSelect(Op, DAG);
    return LowerFROUND(Op, DAG);
    return LowerINT_TO_FP(Op, DAG);
    return LowerFP_TO_INT(Op, DAG);
    return LowerFP_ROUND(Op, DAG);
    return LowerFP_EXTEND(Op, DAG);
    return LowerBR_JT(Op, DAG);
    return LowerVAARG(Op, DAG);
    return LowerVASTART(Op, DAG);
    return LowerCopyToReg_128(Op, DAG);
  const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
  unsigned JId = JT->getIndex();

  const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  EVT VT = Node->getValueType(0);
  SDValue Tmp1 = Node->getOperand(0);
  SDValue Tmp2 = Node->getOperand(1);
  const MaybeAlign MA(Node->getConstantOperandVal(3));

  SDValue Arg = getParamSymbol(DAG, -1, PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),

  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering enabled only for i1");

  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);

  EVT VT = Op.getValueType();
  EVT MemVT = Load->getMemoryVT();
                                      MemVT, *Load->getMemOperand())) {

         "Custom lowering for i1 load only");
                              LD->getBasePtr(), LD->getPointerInfo(),
                              MVT::i8, LD->getAlign(),
                              LD->getMemOperand()->getFlags());
  SDValue Ops[] = { result, LD->getChain() };

    return LowerSTOREi1(Op, DAG);
  if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
                                      VT, *Store->getMemOperand()))
    return LowerSTOREVector(Op, DAG);
  if (Alignment < PrefAlign) {
  unsigned Opcode = 0;
  bool NeedExt = false;
  bool StoreF16x2 = false;

  for (unsigned i = 0; i < NumElts; ++i) {
  for (unsigned i = 0; i < NumElts; ++i) {
  Ops.append(N->op_begin() + 2, N->op_end());

      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
                        ST->getAlign(), ST->getMemOperand()->getFlags());

  assert(Op.getOperand(1).getValueType() == MVT::i128 &&
         "Custom lowering for 128-bit CopyToReg only");

  NewOps[0] = Op->getOperand(0);
  NewOps[1] = Op->getOperand(1);
  NewOps[4] = Op->getOperand(3);
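// PTX has no 128-bit registers, so the CopyToReg handled above conveys the
// value as two 64-bit halves. A minimal stand-alone sketch of such a lo/hi
// decomposition, using the __int128 compiler extension purely for
// illustration (the helper name is an assumption):
static void splitI128Sketch(unsigned __int128 V, uint64_t &Lo, uint64_t &Hi) {
  Lo = static_cast<uint64_t>(V);        // low 64 bits
  Hi = static_cast<uint64_t>(V >> 64);  // high 64 bits
}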
unsigned NVPTXTargetLowering::getNumRegisters(
    std::optional<MVT> RegisterVT = std::nullopt) const {
  if (VT == MVT::i128 && RegisterVT == MVT::i128)

bool NVPTXTargetLowering::splitValueIntoRegisterParts(
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  if (Val.getValueType() == MVT::i128 && NumParts == 1) {
  std::vector<SDValue> OutChains;

  assert(isABI && "Non-ABI compilation is not supported");

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
    theArgs.push_back(&I);
    argTypes.push_back(I.getType());

  unsigned InsIdx = 0;
  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
    Type *Ty = argTypes[i];
    if (theArgs[i]->use_empty()) {
      if (vtparts.empty())
      for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
      if (vtparts.size() > 0)
      for (unsigned parti = 0; parti < NumRegs; ++parti) {

      bool aggregateIsPacked = false;
      if (StructType *STy = dyn_cast<StructType>(Ty))
        aggregateIsPacked = STy->isPacked();

      SDValue Arg = getParamSymbol(DAG, i, PtrVT);
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
          assert(VecIdx == -1 && "Orphaned vector.");
        if (VectorInfo[parti] & PVF_LAST) {
          unsigned NumElts = parti - VecIdx + 1;
          EVT EltVT = VTs[parti];
          if (EltVT == MVT::i1)
          else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
                if (aggregateIsPacked)
                return std::nullopt;
          P.getNode()->setIROrder(i + 1);
          for (unsigned j = 0; j < NumElts; ++j) {
            if (EltVT == MVT::i1)
            else if (EltVT != LoadVT)
                Ins[InsIdx].VT.getFixedSizeInBits() >
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);

    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
    SDValue Arg = getParamSymbol(DAG, i, PtrVT);
    p.getNode()->setIROrder(i + 1);

  if (!OutChains.empty())
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
                        DAG.getVTList(MVT::Other), StoreOperands,

  assert(isABI && "Non-ABI compilation is not supported");
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue PromotedOutVal = OutVals[i];
      VTs[i] = EVT(PromotedVT);
      PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
    PromotedOutVals.push_back(PromotedOutVal);

  bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue RetVal = PromotedOutVals[i];
    if (ExtendIntegerRetVal) {
                           dl, MVT::i32, RetVal);

    EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
    Align ElementTypeAlign =
        DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
    Align ElementAlign =
    if (ElementAlign < ElementTypeAlign) {
      assert(StoreOperands.empty() && "Orphaned operand list.");
      assert(StoreOperands.empty() && "Orphaned operand list.");
      unsigned NumElts = StoreOperands.size() - 2;
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
      StoreOperands.clear();
  if (Constraint.size() > 1)

  switch (Intrinsic) {
  case Intrinsic::nvvm_tex_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:

  switch (Intrinsic) {
  case Intrinsic::nvvm_suld_1d_i8_clamp:
  case Intrinsic::nvvm_suld_1d_i16_clamp:
  case Intrinsic::nvvm_suld_1d_i32_clamp:
  case Intrinsic::nvvm_suld_1d_i64_clamp:
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_i8_clamp:
  case Intrinsic::nvvm_suld_2d_i16_clamp:
  case Intrinsic::nvvm_suld_2d_i32_clamp:
  case Intrinsic::nvvm_suld_2d_i64_clamp:
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_3d_i8_clamp:
  case Intrinsic::nvvm_suld_3d_i16_clamp:
  case Intrinsic::nvvm_suld_3d_i32_clamp:
  case Intrinsic::nvvm_suld_3d_i64_clamp:
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_i8_trap:
  case Intrinsic::nvvm_suld_1d_i16_trap:
  case Intrinsic::nvvm_suld_1d_i32_trap:
  case Intrinsic::nvvm_suld_1d_i64_trap:
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_v4i8_trap: