LLVM 17.0.0git
NVPTXISelLowering.cpp
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
33#include "llvm/IR/Argument.h"
34#include "llvm/IR/Attributes.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
38#include "llvm/IR/FPEnv.h"
39#include "llvm/IR/Function.h"
40#include "llvm/IR/GlobalValue.h"
41#include "llvm/IR/Instruction.h"
43#include "llvm/IR/IntrinsicsNVPTX.h"
44#include "llvm/IR/Module.h"
45#include "llvm/IR/Type.h"
46#include "llvm/IR/Value.h"
55#include <algorithm>
56#include <cassert>
57#include <cmath>
58#include <cstdint>
59#include <iterator>
60#include <sstream>
61#include <string>
62#include <utility>
63#include <vector>
64
65#define DEBUG_TYPE "nvptx-lower"
66
67using namespace llvm;
68
69static std::atomic<unsigned> GlobalUniqueCallSite;
70
72 "nvptx-sched4reg",
73 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
74
76 "nvptx-fma-level", cl::Hidden,
77 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
78 " 1: do it 2: do it aggressively"),
79 cl::init(2));
80
82 "nvptx-prec-divf32", cl::Hidden,
83 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
84 " IEEE Compliant F32 div.rnd if available."),
85 cl::init(2));
86
88 "nvptx-prec-sqrtf32", cl::Hidden,
89 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
90 cl::init(true));
91
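// For example, compiling with "-nvptx-prec-divf32=0" selects div.approx.f32
// for f32 division, while the default value of 2 keeps the IEEE-compliant
// div.rn.f32 when it is available; getDivF32Level() below folds the fast-math
// (UnsafeFPMath) setting into the same decision when the flag is not given
// explicitly on the command line.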
92 int NVPTXTargetLowering::getDivF32Level() const {
93 if (UsePrecDivF32.getNumOccurrences() > 0) {
94 // If nvptx-prec-divf32=N is used on the command-line, always honor it
95 return UsePrecDivF32;
96 } else {
97 // Otherwise, use div.approx if fast math is enabled
98 if (getTargetMachine().Options.UnsafeFPMath)
99 return 0;
100 else
101 return 2;
102 }
103}
104
105 bool NVPTXTargetLowering::usePrecSqrtF32() const {
106 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
107 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
108 return UsePrecSqrtF32;
109 } else {
110 // Otherwise, use sqrt.approx if fast math is enabled
111 return !getTargetMachine().Options.UnsafeFPMath;
112 }
113}
114
118}
119
120static bool IsPTXVectorType(MVT VT) {
121 switch (VT.SimpleTy) {
122 default:
123 return false;
124 case MVT::v2i1:
125 case MVT::v4i1:
126 case MVT::v2i8:
127 case MVT::v4i8:
128 case MVT::v2i16:
129 case MVT::v4i16:
130 case MVT::v2i32:
131 case MVT::v4i32:
132 case MVT::v2i64:
133 case MVT::v2f16:
134 case MVT::v4f16:
135 case MVT::v8f16: // <4 x f16x2>
136 case MVT::v2bf16:
137 case MVT::v4bf16:
138 case MVT::v8bf16: // <4 x bf16x2>
139 case MVT::v2f32:
140 case MVT::v4f32:
141 case MVT::v2f64:
142 return true;
143 }
144}
145
146/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
147/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
148/// into their primitive components.
149/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
150/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
151/// LowerCall, and LowerReturn.
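///
/// For example (illustrative of the rules below): an i128 is decomposed into
/// two i64 pieces at offsets 0 and 8, a <4 x float> into four f32 pieces at
/// offsets 0, 4, 8 and 12, and a <4 x half> is kept as two v2f16 pieces at
/// offsets 0 and 4 so that it matches how f16 vectors appear in Ins/Outs.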
152static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
153 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
154 SmallVectorImpl<uint64_t> *Offsets = nullptr,
155 uint64_t StartingOffset = 0) {
156 SmallVector<EVT, 16> TempVTs;
157 SmallVector<uint64_t, 16> TempOffsets;
158
159 // Special case for i128 - decompose to (i64, i64)
160 if (Ty->isIntegerTy(128)) {
161 ValueVTs.push_back(EVT(MVT::i64));
162 ValueVTs.push_back(EVT(MVT::i64));
163
164 if (Offsets) {
165 Offsets->push_back(StartingOffset + 0);
166 Offsets->push_back(StartingOffset + 8);
167 }
168
169 return;
170 }
171
172 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
173 if (StructType *STy = dyn_cast<StructType>(Ty)) {
174 auto const *SL = DL.getStructLayout(STy);
175 auto ElementNum = 0;
176 for(auto *EI : STy->elements()) {
177 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
178 StartingOffset + SL->getElementOffset(ElementNum));
179 ++ElementNum;
180 }
181 return;
182 }
183
184 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
185 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
186 EVT VT = TempVTs[i];
187 uint64_t Off = TempOffsets[i];
188 // Split vectors into individual elements, except for v2f16, which
189 // we will pass as a single scalar.
190 if (VT.isVector()) {
191 unsigned NumElts = VT.getVectorNumElements();
192 EVT EltVT = VT.getVectorElementType();
193 // Vectors with an even number of f16 elements will be passed to
194 // us as an array of v2f16/v2bf16 elements. We must match this so we
195 // stay in sync with Ins/Outs.
196 if ((EltVT == MVT::f16 || EltVT == MVT::bf16) && NumElts % 2 == 0) {
197 EltVT = EltVT == MVT::f16 ? MVT::v2f16 : MVT::v2bf16;
198 NumElts /= 2;
199 }
200 for (unsigned j = 0; j != NumElts; ++j) {
201 ValueVTs.push_back(EltVT);
202 if (Offsets)
203 Offsets->push_back(Off + j * EltVT.getStoreSize());
204 }
205 } else {
206 ValueVTs.push_back(VT);
207 if (Offsets)
208 Offsets->push_back(Off);
209 }
210 }
211}
212
213/// PromoteScalarIntegerPTX
214/// Used to make sure the arguments/returns are suitable for passing
215/// and promote them to a larger size if they're not.
216///
217/// The promoted type is placed in \p PromotedVT if the function returns true.
218static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
219 if (VT.isScalarInteger()) {
220 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
221 default:
223 "Promotion is not suitable for scalars of size larger than 64-bits");
224 case 1:
225 *PromotedVT = MVT::i1;
226 break;
227 case 2:
228 case 4:
229 case 8:
230 *PromotedVT = MVT::i8;
231 break;
232 case 16:
233 *PromotedVT = MVT::i16;
234 break;
235 case 32:
236 *PromotedVT = MVT::i32;
237 break;
238 case 64:
239 *PromotedVT = MVT::i64;
240 break;
241 }
242 return EVT(*PromotedVT) != VT;
243 }
244 return false;
245}
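
// For example, an i24 argument is promoted to i32 here (PowerOf2Ceil(24) is
// 32, so the function returns true), while i1/i8/i16/i32/i64 values are
// already suitable and are returned unchanged (the function returns false).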
246
247// Check whether we can merge loads/stores of some of the pieces of a
248// flattened function parameter or return value into a single vector
249// load/store.
250//
251// The flattened parameter is represented as a list of EVTs and
252// offsets, and the whole structure is aligned to ParamAlignment. This
253// function determines whether we can load/store pieces of the
254// parameter starting at index Idx using a single vectorized op of
255// size AccessSize. If so, it returns the number of param pieces
256// covered by the vector op. Otherwise, it returns 1.
257 static unsigned CanMergeParamLoadStoresStartingAt(
258 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
259 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
260
261 // Can't vectorize if param alignment is not sufficient.
262 if (ParamAlignment < AccessSize)
263 return 1;
264 // Can't vectorize if offset is not aligned.
265 if (Offsets[Idx] & (AccessSize - 1))
266 return 1;
267
268 EVT EltVT = ValueVTs[Idx];
269 unsigned EltSize = EltVT.getStoreSize();
270
271 // Element is too large to vectorize.
272 if (EltSize >= AccessSize)
273 return 1;
274
275 unsigned NumElts = AccessSize / EltSize;
276 // Can't vectorize if AccessSize is not a multiple of EltSize.
277 if (AccessSize != EltSize * NumElts)
278 return 1;
279
280 // We don't have enough elements to vectorize.
281 if (Idx + NumElts > ValueVTs.size())
282 return 1;
283
284 // PTX ISA can only deal with 2- and 4-element vector ops.
285 if (NumElts != 4 && NumElts != 2)
286 return 1;
287
288 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
289 // Types do not match.
290 if (ValueVTs[j] != EltVT)
291 return 1;
292
293 // Elements are not contiguous.
294 if (Offsets[j] - Offsets[j - 1] != EltSize)
295 return 1;
296 }
297 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
298 return NumElts;
299}
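
// For example, with ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12}
// and ParamAlignment = 16, calling with Idx = 0 and AccessSize = 16 returns 4
// (a single v4 access covers all pieces). With ParamAlignment = 8, the same
// call returns 1, but AccessSize = 8 returns 2.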
300
301// Flags for tracking per-element vectorization state of loads/stores
302// of a flattened function parameter or return value.
303 enum ParamVectorizationFlags {
304 PVF_INNER = 0x0, // Middle elements of a vector.
305 PVF_FIRST = 0x1, // First element of the vector.
306 PVF_LAST = 0x2, // Last element of the vector.
307 // Scalar is effectively a 1-element vector.
308 PVF_SCALAR = PVF_FIRST | PVF_LAST
309 };
310
311// Computes whether and how we can vectorize the loads/stores of a
312// flattened function parameter or return value.
313//
314// The flattened parameter is represented as the list of ValueVTs and
315// Offsets, and is aligned to ParamAlignment bytes. We return a vector
316// of the same size as ValueVTs indicating how each piece should be
317// loaded/stored (i.e. as a scalar, or as part of a vector
318// load/store).
319 static SmallVector<ParamVectorizationFlags, 16>
320 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
321 const SmallVectorImpl<uint64_t> &Offsets,
322 Align ParamAlignment, bool IsVAArg = false) {
323 // Set vector size to match ValueVTs and mark all elements as
324 // scalars by default.
325 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
326 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
327
328 if (IsVAArg)
329 return VectorInfo;
330
331 // Check what we can vectorize using 128/64/32-bit accesses.
332 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
333 // Skip elements we've already processed.
334 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
335 for (unsigned AccessSize : {16, 8, 4, 2}) {
336 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
337 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
338 // Mark vectorized elements.
339 switch (NumElts) {
340 default:
341 llvm_unreachable("Unexpected return value");
342 case 1:
343 // Can't vectorize using this size, try next smaller size.
344 continue;
345 case 2:
346 assert(I + 1 < E && "Not enough elements.");
347 VectorInfo[I] = PVF_FIRST;
348 VectorInfo[I + 1] = PVF_LAST;
349 I += 1;
350 break;
351 case 4:
352 assert(I + 3 < E && "Not enough elements.");
353 VectorInfo[I] = PVF_FIRST;
354 VectorInfo[I + 1] = PVF_INNER;
355 VectorInfo[I + 2] = PVF_INNER;
356 VectorInfo[I + 3] = PVF_LAST;
357 I += 3;
358 break;
359 }
360 // Break out of the inner loop because we've already succeeded
361 // using largest possible AccessSize.
362 break;
363 }
364 }
365 return VectorInfo;
366}
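
// For example, for four contiguous f32 pieces, a 16-byte-aligned parameter
// yields {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} (one v4 access), while an
// 8-byte-aligned one yields {PVF_FIRST, PVF_LAST, PVF_FIRST, PVF_LAST} (two v2
// accesses). Variadic arguments are always left as PVF_SCALAR.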
367
368// NVPTXTargetLowering Constructor.
370 const NVPTXSubtarget &STI)
371 : TargetLowering(TM), nvTM(&TM), STI(STI) {
372 // Always lower memset, memcpy, and memmove intrinsics to load/store
373 // instructions, rather than generating calls to
374 // memset, memcpy or memmove.
375 MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
376 MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
377 MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
378
381
382 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
383 // condition branches.
384 setJumpIsExpensive(true);
385
386 // Wide divides are _very_ slow. Try to reduce the width of the divide if
387 // possible.
388 addBypassSlowDiv(64, 32);
389
390 // By default, use the Source scheduling
391 if (sched4reg)
393 else
395
396 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
397 LegalizeAction NoF16Action) {
398 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
399 };
400
401 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
402 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
403 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
404 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
405 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
406 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
407 addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
408 addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
409 addRegisterClass(MVT::bf16, &NVPTX::Float16RegsRegClass);
410 addRegisterClass(MVT::v2bf16, &NVPTX::Float16x2RegsRegClass);
411
412 // Conversion to/from FP16/FP16x2 is always legal.
419
420 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
421 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
422
423 // Operations not directly supported by NVPTX.
428 }
429
430 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
431 // For others we will expand to a SHL/SRA pair.
437
444
447
448 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
449 // that don't have h/w rotation we lower them to multi-instruction assembly.
450 // See ROT*_sw in NVPTXIntrInfo.td
455
463
464 // Indirect branch is not supported.
465 // This also disables Jump Table creation.
468
471
472 // We want to legalize constant-related memmove and memcpy
473 // intrinsics.
475
476 // Turn FP extload into load/fpextend
486 // Turn FP truncstore into trunc + store.
487 // FIXME: vector types should also be expanded
491
492 // PTX does not support load / store predicate registers
495
496 for (MVT VT : MVT::integer_valuetypes()) {
500 }
501
502 // This is legal in NVPTX
507
508 // TRAP can be lowered to PTX trap
510
511 // Register custom handling for vector loads/stores
513 if (IsPTXVectorType(VT)) {
517 }
518 }
519
520 // Support varargs.
525
526 // Custom handling for i8 intrinsics
528
529 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
535
538 }
539
544 if (STI.getPTXVersion() >= 43) {
549 }
550
554
555 // PTX does not directly support SELP of i1, so promote to i32 first
557
558 // PTX cannot multiply two i64s in a single instruction.
561
562 // We have some custom DAG combine patterns for these nodes
565
566 // setcc for f16x2 needs special handling to prevent legalizer's
567 // attempt to scalarize it due to v2i1 not being legal.
568 if (STI.allowFP16Math())
570
571 // Promote fp16 arithmetic if fp16 hardware isn't available or the
572 // user passed --nvptx-no-fp16-math. The flag is useful because,
573 // although sm_53+ GPUs have some sort of FP16 support in
574 // hardware, only sm_53 and sm_60 have full implementation. Others
575 // only have a token amount of hardware and are likely to run faster
576 // by using fp32 units instead.
577 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
578 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
579 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
580 }
581
582 // f16/f16x2 neg was introduced in PTX 60, SM_53.
583 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
584 STI.getPTXVersion() >= 60 &&
585 STI.allowFP16Math();
586 for (const auto &VT : {MVT::f16, MVT::v2f16})
588 IsFP16FP16x2NegAvailable ? Legal : Expand);
589
590 // (would be) Library functions.
591
592 // These map to conversion instructions for scalar FP types.
593 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
599 }
600
605
606
607 // 'Expand' implements FCOPYSIGN without calling an external library.
612
613 // These map to corresponding instructions for f32/f64. f16 must be
614 // promoted to f32. v2f16 is expanded to f16, which is then promoted
615 // to f32.
616 for (const auto &Op :
622 }
623 // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
624 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
625 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
626 return IsAtLeastSm80 ? Legal : NotSm80Action;
627 };
628 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
629 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
632 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
633 }
634 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
635 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
636 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
637 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
638 }
639
640 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
641 // No FPOW or FREM in PTX.
642
643 // Now deduce the information based on the above mentioned
644 // actions
646
648}
649
650const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
651 switch ((NVPTXISD::NodeType)Opcode) {
653 break;
654 case NVPTXISD::CALL:
655 return "NVPTXISD::CALL";
657 return "NVPTXISD::RET_FLAG";
659 return "NVPTXISD::LOAD_PARAM";
661 return "NVPTXISD::Wrapper";
663 return "NVPTXISD::DeclareParam";
665 return "NVPTXISD::DeclareScalarParam";
667 return "NVPTXISD::DeclareRet";
669 return "NVPTXISD::DeclareScalarRet";
671 return "NVPTXISD::DeclareRetParam";
673 return "NVPTXISD::PrintCall";
675 return "NVPTXISD::PrintConvergentCall";
677 return "NVPTXISD::PrintCallUni";
679 return "NVPTXISD::PrintConvergentCallUni";
681 return "NVPTXISD::LoadParam";
683 return "NVPTXISD::LoadParamV2";
685 return "NVPTXISD::LoadParamV4";
687 return "NVPTXISD::StoreParam";
689 return "NVPTXISD::StoreParamV2";
691 return "NVPTXISD::StoreParamV4";
693 return "NVPTXISD::StoreParamS32";
695 return "NVPTXISD::StoreParamU32";
697 return "NVPTXISD::CallArgBegin";
699 return "NVPTXISD::CallArg";
701 return "NVPTXISD::LastCallArg";
703 return "NVPTXISD::CallArgEnd";
705 return "NVPTXISD::CallVoid";
707 return "NVPTXISD::CallVal";
709 return "NVPTXISD::CallSymbol";
711 return "NVPTXISD::Prototype";
713 return "NVPTXISD::MoveParam";
715 return "NVPTXISD::StoreRetval";
717 return "NVPTXISD::StoreRetvalV2";
719 return "NVPTXISD::StoreRetvalV4";
721 return "NVPTXISD::PseudoUseParam";
722 case NVPTXISD::RETURN:
723 return "NVPTXISD::RETURN";
725 return "NVPTXISD::CallSeqBegin";
727 return "NVPTXISD::CallSeqEnd";
729 return "NVPTXISD::CallPrototype";
731 return "NVPTXISD::ProxyReg";
732 case NVPTXISD::LoadV2:
733 return "NVPTXISD::LoadV2";
734 case NVPTXISD::LoadV4:
735 return "NVPTXISD::LoadV4";
736 case NVPTXISD::LDGV2:
737 return "NVPTXISD::LDGV2";
738 case NVPTXISD::LDGV4:
739 return "NVPTXISD::LDGV4";
740 case NVPTXISD::LDUV2:
741 return "NVPTXISD::LDUV2";
742 case NVPTXISD::LDUV4:
743 return "NVPTXISD::LDUV4";
745 return "NVPTXISD::StoreV2";
747 return "NVPTXISD::StoreV4";
749 return "NVPTXISD::FUN_SHFL_CLAMP";
751 return "NVPTXISD::FUN_SHFR_CLAMP";
752 case NVPTXISD::IMAD:
753 return "NVPTXISD::IMAD";
755 return "NVPTXISD::SETP_F16X2";
756 case NVPTXISD::Dummy:
757 return "NVPTXISD::Dummy";
759 return "NVPTXISD::MUL_WIDE_SIGNED";
761 return "NVPTXISD::MUL_WIDE_UNSIGNED";
762 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
763 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
765 return "NVPTXISD::Tex1DFloatFloatLevel";
767 return "NVPTXISD::Tex1DFloatFloatGrad";
768 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
769 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
771 return "NVPTXISD::Tex1DS32FloatLevel";
773 return "NVPTXISD::Tex1DS32FloatGrad";
774 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
775 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
777 return "NVPTXISD::Tex1DU32FloatLevel";
779 return "NVPTXISD::Tex1DU32FloatGrad";
780 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
781 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
783 return "NVPTXISD::Tex1DArrayFloatFloatLevel";
785 return "NVPTXISD::Tex1DArrayFloatFloatGrad";
786 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
787 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
789 return "NVPTXISD::Tex1DArrayS32FloatLevel";
791 return "NVPTXISD::Tex1DArrayS32FloatGrad";
792 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
793 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
795 return "NVPTXISD::Tex1DArrayU32FloatLevel";
797 return "NVPTXISD::Tex1DArrayU32FloatGrad";
798 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
799 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
801 return "NVPTXISD::Tex2DFloatFloatLevel";
803 return "NVPTXISD::Tex2DFloatFloatGrad";
804 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
805 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
807 return "NVPTXISD::Tex2DS32FloatLevel";
809 return "NVPTXISD::Tex2DS32FloatGrad";
810 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
811 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
813 return "NVPTXISD::Tex2DU32FloatLevel";
815 return "NVPTXISD::Tex2DU32FloatGrad";
816 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
817 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
819 return "NVPTXISD::Tex2DArrayFloatFloatLevel";
821 return "NVPTXISD::Tex2DArrayFloatFloatGrad";
822 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
823 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
825 return "NVPTXISD::Tex2DArrayS32FloatLevel";
827 return "NVPTXISD::Tex2DArrayS32FloatGrad";
828 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
829 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
831 return "NVPTXISD::Tex2DArrayU32FloatLevel";
833 return "NVPTXISD::Tex2DArrayU32FloatGrad";
834 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
835 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
837 return "NVPTXISD::Tex3DFloatFloatLevel";
839 return "NVPTXISD::Tex3DFloatFloatGrad";
840 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
841 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
843 return "NVPTXISD::Tex3DS32FloatLevel";
845 return "NVPTXISD::Tex3DS32FloatGrad";
846 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
847 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
849 return "NVPTXISD::Tex3DU32FloatLevel";
851 return "NVPTXISD::Tex3DU32FloatGrad";
852 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
854 return "NVPTXISD::TexCubeFloatFloatLevel";
855 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
857 return "NVPTXISD::TexCubeS32FloatLevel";
858 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
860 return "NVPTXISD::TexCubeU32FloatLevel";
862 return "NVPTXISD::TexCubeArrayFloatFloat";
864 return "NVPTXISD::TexCubeArrayFloatFloatLevel";
866 return "NVPTXISD::TexCubeArrayS32Float";
868 return "NVPTXISD::TexCubeArrayS32FloatLevel";
870 return "NVPTXISD::TexCubeArrayU32Float";
872 return "NVPTXISD::TexCubeArrayU32FloatLevel";
874 return "NVPTXISD::Tld4R2DFloatFloat";
876 return "NVPTXISD::Tld4G2DFloatFloat";
878 return "NVPTXISD::Tld4B2DFloatFloat";
880 return "NVPTXISD::Tld4A2DFloatFloat";
882 return "NVPTXISD::Tld4R2DS64Float";
884 return "NVPTXISD::Tld4G2DS64Float";
886 return "NVPTXISD::Tld4B2DS64Float";
888 return "NVPTXISD::Tld4A2DS64Float";
890 return "NVPTXISD::Tld4R2DU64Float";
892 return "NVPTXISD::Tld4G2DU64Float";
894 return "NVPTXISD::Tld4B2DU64Float";
896 return "NVPTXISD::Tld4A2DU64Float";
897
899 return "NVPTXISD::TexUnified1DFloatS32";
901 return "NVPTXISD::TexUnified1DFloatFloat";
903 return "NVPTXISD::TexUnified1DFloatFloatLevel";
905 return "NVPTXISD::TexUnified1DFloatFloatGrad";
907 return "NVPTXISD::TexUnified1DS32S32";
909 return "NVPTXISD::TexUnified1DS32Float";
911 return "NVPTXISD::TexUnified1DS32FloatLevel";
913 return "NVPTXISD::TexUnified1DS32FloatGrad";
915 return "NVPTXISD::TexUnified1DU32S32";
917 return "NVPTXISD::TexUnified1DU32Float";
919 return "NVPTXISD::TexUnified1DU32FloatLevel";
921 return "NVPTXISD::TexUnified1DU32FloatGrad";
923 return "NVPTXISD::TexUnified1DArrayFloatS32";
925 return "NVPTXISD::TexUnified1DArrayFloatFloat";
927 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
929 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
931 return "NVPTXISD::TexUnified1DArrayS32S32";
933 return "NVPTXISD::TexUnified1DArrayS32Float";
935 return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
937 return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
939 return "NVPTXISD::TexUnified1DArrayU32S32";
941 return "NVPTXISD::TexUnified1DArrayU32Float";
943 return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
945 return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
947 return "NVPTXISD::TexUnified2DFloatS32";
949 return "NVPTXISD::TexUnified2DFloatFloat";
951 return "NVPTXISD::TexUnified2DFloatFloatLevel";
953 return "NVPTXISD::TexUnified2DFloatFloatGrad";
955 return "NVPTXISD::TexUnified2DS32S32";
957 return "NVPTXISD::TexUnified2DS32Float";
959 return "NVPTXISD::TexUnified2DS32FloatLevel";
961 return "NVPTXISD::TexUnified2DS32FloatGrad";
963 return "NVPTXISD::TexUnified2DU32S32";
965 return "NVPTXISD::TexUnified2DU32Float";
967 return "NVPTXISD::TexUnified2DU32FloatLevel";
969 return "NVPTXISD::TexUnified2DU32FloatGrad";
971 return "NVPTXISD::TexUnified2DArrayFloatS32";
973 return "NVPTXISD::TexUnified2DArrayFloatFloat";
975 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
977 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
979 return "NVPTXISD::TexUnified2DArrayS32S32";
981 return "NVPTXISD::TexUnified2DArrayS32Float";
983 return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
985 return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
987 return "NVPTXISD::TexUnified2DArrayU32S32";
989 return "NVPTXISD::TexUnified2DArrayU32Float";
991 return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
993 return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
995 return "NVPTXISD::TexUnified3DFloatS32";
997 return "NVPTXISD::TexUnified3DFloatFloat";
999 return "NVPTXISD::TexUnified3DFloatFloatLevel";
1001 return "NVPTXISD::TexUnified3DFloatFloatGrad";
1003 return "NVPTXISD::TexUnified3DS32S32";
1005 return "NVPTXISD::TexUnified3DS32Float";
1007 return "NVPTXISD::TexUnified3DS32FloatLevel";
1009 return "NVPTXISD::TexUnified3DS32FloatGrad";
1011 return "NVPTXISD::TexUnified3DU32S32";
1013 return "NVPTXISD::TexUnified3DU32Float";
1015 return "NVPTXISD::TexUnified3DU32FloatLevel";
1017 return "NVPTXISD::TexUnified3DU32FloatGrad";
1019 return "NVPTXISD::TexUnifiedCubeFloatFloat";
1021 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
1023 return "NVPTXISD::TexUnifiedCubeS32Float";
1025 return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
1027 return "NVPTXISD::TexUnifiedCubeU32Float";
1029 return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
1031 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
1033 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
1035 return "NVPTXISD::TexUnifiedCubeArrayS32Float";
1037 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
1039 return "NVPTXISD::TexUnifiedCubeArrayU32Float";
1041 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
1043 return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
1045 return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
1047 return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
1049 return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
1051 return "NVPTXISD::Tld4UnifiedR2DS64Float";
1053 return "NVPTXISD::Tld4UnifiedG2DS64Float";
1055 return "NVPTXISD::Tld4UnifiedB2DS64Float";
1057 return "NVPTXISD::Tld4UnifiedA2DS64Float";
1059 return "NVPTXISD::Tld4UnifiedR2DU64Float";
1061 return "NVPTXISD::Tld4UnifiedG2DU64Float";
1063 return "NVPTXISD::Tld4UnifiedB2DU64Float";
1065 return "NVPTXISD::Tld4UnifiedA2DU64Float";
1066
1067 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1068 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1069 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1070 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1071 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1072 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1073 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1074 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1075 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1076 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1077 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1078
1079 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1080 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1081 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1082 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1083 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1084 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1085 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1086 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1087 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1088 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1089 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1090
1091 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1092 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1093 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1094 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1095 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1096 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1097 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1098 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1099 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1100 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1101 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1102
1103 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1104 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1105 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1106 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1107 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1108 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1109 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1110 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1111 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1112 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1113 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1114
1115 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1116 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1117 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1118 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1119 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1120 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1121 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1122 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1123 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1124 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1125 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1126
1127 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1128 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1129 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1130 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1131 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1132 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1133 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1134 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1135 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1136 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1137 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1138
1139 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1140 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1141 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1142 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1143 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1144 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1145 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1146 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1147 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1148 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1149 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1150
1151 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1152 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1153 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1154 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1155 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1156 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1157 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1158 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1159 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1160 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1161 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1162
1163 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1164 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1165 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1166 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1167 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1168 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1169 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1170 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1171 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1172 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1173 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1174
1175 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1176 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1177 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1178 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1179 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1180 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1181 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1182 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1183 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1184 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1185 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1186
1187 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1188 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1189 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1190 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1191 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1192 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1193 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1194 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1195 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1196 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1197 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1198
1199 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1200 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1201 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1202 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1203 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1204 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1205 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1206 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1207 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1208 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1209 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1210
1211 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1212 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1213 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1214 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1215 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1216 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1217 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1218 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1219 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1220 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1221 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1222
1223 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1224 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1225 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1226 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1227 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1228 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1229 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1230 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1231 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1232 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1233 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1234
1235 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1236 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1237 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1238 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1239 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1240 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1241 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1242 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1243 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1244 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1245 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1246 }
1247 return nullptr;
1248}
1249
1252 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1253 VT.getScalarType() == MVT::i1)
1254 return TypeSplitVector;
1255 if (VT == MVT::v2f16)
1256 return TypeLegal;
1258}
1259
1261 int Enabled, int &ExtraSteps,
1262 bool &UseOneConst,
1263 bool Reciprocal) const {
1266 return SDValue();
1267
1268 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1269 ExtraSteps = 0;
1270
1271 SDLoc DL(Operand);
1272 EVT VT = Operand.getValueType();
1273 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1274
1275 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1276 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1277 DAG.getConstant(IID, DL, MVT::i32), Operand);
1278 };
1279
1280 // The sqrt and rsqrt refinement processes assume we always start out with an
1281 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1282 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1283 // any refinement, we must return a regular sqrt.
1284 if (Reciprocal || ExtraSteps > 0) {
1285 if (VT == MVT::f32)
1286 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1287 : Intrinsic::nvvm_rsqrt_approx_f);
1288 else if (VT == MVT::f64)
1289 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1290 else
1291 return SDValue();
1292 } else {
1293 if (VT == MVT::f32)
1294 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1295 : Intrinsic::nvvm_sqrt_approx_f);
1296 else {
1297 // There's no sqrt.approx.f64 instruction, so we emit
1298 // reciprocal(rsqrt(x)). This is faster than
1299 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1300 // x * rsqrt(x).)
1301 return DAG.getNode(
1303 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1304 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1305 }
1306 }
1307}
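
// For example, an f32 square root with no extra refinement steps is lowered to
// the nvvm_sqrt_approx_f(tz) intrinsic (PTX sqrt.approx{.ftz}.f32), a
// reciprocal square root to rsqrt.approx.f32, and the f64 non-reciprocal case
// to rcp.approx.ftz.f64(rsqrt.approx.f64(x)) as described in the comment
// above.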
1308
1309SDValue
1311 SDLoc dl(Op);
1312 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1313 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1314 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1315 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1316}
1317
1319 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1320 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1321 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1322 const CallBase &CB, unsigned UniqueCallSite) const {
1323 auto PtrVT = getPointerTy(DL);
1324
1325 bool isABI = (STI.getSmVersion() >= 20);
1326 assert(isABI && "Non-ABI compilation is not supported");
1327 if (!isABI)
1328 return "";
1329
1330 std::string Prototype;
1331 raw_string_ostream O(Prototype);
1332 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1333
1334 if (retTy->getTypeID() == Type::VoidTyID) {
1335 O << "()";
1336 } else {
1337 O << "(";
1338 if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1339 unsigned size = 0;
1340 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1341 size = ITy->getBitWidth();
1342 } else {
1343 assert(retTy->isFloatingPointTy() &&
1344 "Floating point type expected here");
1345 size = retTy->getPrimitiveSizeInBits();
1346 }
1347 // PTX ABI requires all scalar return values to be at least 32
1348 // bits in size. fp16 normally uses .b16 as its storage type in
1349 // PTX, so its size must be adjusted here, too.
1351
1352 O << ".param .b" << size << " _";
1353 } else if (isa<PointerType>(retTy)) {
1354 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1355 } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
1356 retTy->isIntegerTy(128)) {
1357 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1358 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1359 } else {
1360 llvm_unreachable("Unknown return type");
1361 }
1362 O << ") ";
1363 }
1364 O << "_ (";
1365
1366 bool first = true;
1367
1368 const Function *F = CB.getFunction();
1369 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1370 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1371 Type *Ty = Args[i].Ty;
1372 if (!first) {
1373 O << ", ";
1374 }
1375 first = false;
1376
1377 if (!Outs[OIdx].Flags.isByVal()) {
1378 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1379 unsigned ParamAlign = 0;
1380 const CallInst *CallI = cast<CallInst>(&CB);
1381 // +1 because index 0 is reserved for return type alignment
1382 if (!getAlign(*CallI, i + 1, ParamAlign))
1383 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1384 O << ".param .align " << ParamAlign << " .b8 ";
1385 O << "_";
1386 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1387 // update the index for Outs
1388 SmallVector<EVT, 16> vtparts;
1389 ComputeValueVTs(*this, DL, Ty, vtparts);
1390 if (unsigned len = vtparts.size())
1391 OIdx += len - 1;
1392 continue;
1393 }
1394 // i8 types in IR will be i16 types in SDAG
1395 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1396 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1397 "type mismatch between callee prototype and arguments");
1398 // scalar type
1399 unsigned sz = 0;
1400 if (isa<IntegerType>(Ty)) {
1401 sz = cast<IntegerType>(Ty)->getBitWidth();
1403 } else if (isa<PointerType>(Ty)) {
1404 sz = PtrVT.getSizeInBits();
1405 } else if (Ty->isHalfTy())
1406 // PTX ABI requires all scalar parameters to be at least 32
1407 // bits in size. fp16 normally uses .b16 as its storage type
1408 // in PTX, so its size must be adjusted here, too.
1409 sz = 32;
1410 else
1411 sz = Ty->getPrimitiveSizeInBits();
1412 O << ".param .b" << sz << " ";
1413 O << "_";
1414 continue;
1415 }
1416
1417 Type *ETy = Args[i].IndirectType;
1418 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1419 Align ParamByValAlign =
1420 getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1421
1422 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1423 O << "_";
1424 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1425 }
1426
1427 if (VAInfo)
1428 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1429 << " .b8 _[]\n";
1430 O << ")";
1432 O << " .noreturn";
1433 O << ";";
1434
1435 return Prototype;
1436}
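
// As an illustrative example (assuming a 64-bit target and default alignment),
// a call to "float f(int, float *)" produces a prototype string along the
// lines of:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);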
1437
1438Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1439 const CallBase *CB, Type *Ty,
1440 unsigned Idx,
1441 const DataLayout &DL) const {
1442 if (!CB) {
1443 // CallSite is zero, fall back to ABI type alignment
1444 return DL.getABITypeAlign(Ty);
1445 }
1446
1447 unsigned Alignment = 0;
1448 const Function *DirectCallee = CB->getCalledFunction();
1449
1450 if (!DirectCallee) {
1451 // We don't have a direct function symbol, but that may be because of
1452 // constant cast instructions in the call.
1453
1454 // With bitcast'd call targets, the instruction will be the call
1455 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1456 // Check if we have call alignment metadata
1457 if (getAlign(*CI, Idx, Alignment))
1458 return Align(Alignment);
1459 }
1460 DirectCallee = getMaybeBitcastedCallee(CB);
1461 }
1462
1463 // Check for function alignment information if we found that the
1464 // ultimate target is a Function
1465 if (DirectCallee) {
1466 if (getAlign(*DirectCallee, Idx, Alignment))
1467 return Align(Alignment);
1468 // If alignment information is not available, fall back to the
1469 // default function param optimized type alignment
1470 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1471 }
1472
1473 // Call is indirect, fall back to the ABI type alignment
1474 return DL.getABITypeAlign(Ty);
1475}
1476
1478 SmallVectorImpl<SDValue> &InVals) const {
1479
1480 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1482 "Support for variadic functions (unsized array parameter) introduced "
1483 "in PTX ISA version 6.0 and requires target sm_30.");
1484
1485 SelectionDAG &DAG = CLI.DAG;
1486 SDLoc dl = CLI.DL;
1488 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1490 SDValue Chain = CLI.Chain;
1491 SDValue Callee = CLI.Callee;
1492 bool &isTailCall = CLI.IsTailCall;
1493 ArgListTy &Args = CLI.getArgs();
1494 Type *RetTy = CLI.RetTy;
1495 const CallBase *CB = CLI.CB;
1496 const DataLayout &DL = DAG.getDataLayout();
1497
1498 bool isABI = (STI.getSmVersion() >= 20);
1499 assert(isABI && "Non-ABI compilation is not supported");
1500 if (!isABI)
1501 return Chain;
1502
1503 // Variadic arguments.
1504 //
1505 // Normally, for each argument, we declare a param scalar or a param
1506 // byte array in the .param space, and store the argument value to that
1507 // param scalar or array starting at offset 0.
1508 //
1509 // In the case of the first variadic argument, we declare a vararg byte array
1510 // with size 0. The exact size of this array isn't known at this point, so
1511 // it'll be patched later. All the variadic arguments will be stored to this
1512 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1513 // initially set to 0, so it can be used for non-variadic arguments (which use
1514 // 0 offset) to simplify the code.
1515 //
1516 // After all varargs are processed, 'VAOffset' holds the size of the
1517 // vararg byte array.
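//
// For example, for a call like printf(fmt, i, f), the format-string pointer
// gets its own .param declaration, while the variadic i and f are both stored
// into the single vararg byte array at offsets tracked by 'VAOffset'
// (illustrative; the exact offsets depend on the promoted types and their
// alignment).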
1518
1519 SDValue VADeclareParam; // vararg byte array
1520 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1521 unsigned VAOffset = 0; // current offset in the param array
1522
1523 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1524 SDValue TempChain = Chain;
1525 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1526 SDValue InFlag = Chain.getValue(1);
1527
1528 unsigned ParamCount = 0;
1529 // Args.size() and Outs.size() need not match.
1530 // Outs.size() will be larger
1531 // * if there is an aggregate argument with multiple fields (each field
1532 // showing up separately in Outs)
1533 // * if there is a vector argument with more than typical vector-length
1534 // elements (generally if more than 4) where each vector element is
1535 // individually present in Outs.
1536 // So a different index should be used for indexing into Outs/OutVals.
1537 // See similar issue in LowerFormalArguments.
1538 unsigned OIdx = 0;
1539 // Declare the .params or .reg needed to pass values
1540 // to the function.
1541 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1542 EVT VT = Outs[OIdx].VT;
1543 Type *Ty = Args[i].Ty;
1544 bool IsVAArg = (i >= CLI.NumFixedArgs);
1545 bool IsByVal = Outs[OIdx].Flags.isByVal();
1546
1549
1550 assert((!IsByVal || Args[i].IndirectType) &&
1551 "byval arg must have indirect type");
1552 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1553 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1554
1555 Align ArgAlign;
1556 if (IsByVal) {
1557 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1558 // so we don't need to worry whether it's naturally aligned or not.
1559 // See TargetLowering::LowerCallTo().
1560 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1561 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1562 InitialAlign, DL);
1563 if (IsVAArg)
1564 VAOffset = alignTo(VAOffset, ArgAlign);
1565 } else {
1566 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
1567 }
1568
1569 unsigned TypeSize =
1570 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1571 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1572
1573 bool NeedAlign; // Does argument declaration specify alignment?
1574 if (IsVAArg) {
1575 if (ParamCount == FirstVAArg) {
1576 SDValue DeclareParamOps[] = {
1577 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1578 DAG.getConstant(ParamCount, dl, MVT::i32),
1579 DAG.getConstant(1, dl, MVT::i32), InFlag};
1580 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1581 DeclareParamVTs, DeclareParamOps);
1582 }
1583 NeedAlign = IsByVal || Ty->isAggregateType() || Ty->isVectorTy() ||
1584 Ty->isIntegerTy(128);
1585 } else if (IsByVal || Ty->isAggregateType() || Ty->isVectorTy() ||
1586 Ty->isIntegerTy(128)) {
1587 // declare .param .align <align> .b8 .param<n>[<size>];
1588 SDValue DeclareParamOps[] = {
1589 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1590 DAG.getConstant(ParamCount, dl, MVT::i32),
1591 DAG.getConstant(TypeSize, dl, MVT::i32), InFlag};
1592 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1593 DeclareParamOps);
1594 NeedAlign = true;
1595 } else {
1596 // declare .param .b<size> .param<n>;
1597 if (VT.isInteger() || VT.isFloatingPoint()) {
1598 // PTX ABI requires integral types to be at least 32 bits in
1599 // size. FP16 is loaded/stored using i16, so it's handled
1600 // here as well.
1602 }
1603 SDValue DeclareScalarParamOps[] = {
1604 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1605 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1606 DAG.getConstant(0, dl, MVT::i32), InFlag};
1607 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1608 DeclareScalarParamOps);
1609 NeedAlign = false;
1610 }
1611 InFlag = Chain.getValue(1);
1612
1613 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1614 // than 32-bits are sign extended or zero extended, depending on
1615 // whether they are signed or unsigned types. This case applies
1616 // only to scalar parameters and not to aggregate values.
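// For example, an i8 scalar argument is declared as a .b32 parameter and its
// value is sign- or zero-extended to i32 before being stored below.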
1617 bool ExtendIntegerParam =
1618 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1619
1620 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1621 SmallVector<SDValue, 6> StoreOperands;
1622 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1623 EVT EltVT = VTs[j];
1624 int CurOffset = Offsets[j];
1625 MaybeAlign PartAlign;
1626 if (NeedAlign)
1627 PartAlign = commonAlignment(ArgAlign, CurOffset);
1628
1629 // New store.
1630 if (VectorInfo[j] & PVF_FIRST) {
1631 assert(StoreOperands.empty() && "Unfinished preceding store.");
1632 StoreOperands.push_back(Chain);
1633 StoreOperands.push_back(
1634 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1635 StoreOperands.push_back(DAG.getConstant(
1636 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1637 dl, MVT::i32));
1638 }
1639
1640 SDValue StVal = OutVals[OIdx];
1641
1642 MVT PromotedVT;
1643 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1644 EltVT = EVT(PromotedVT);
1645 }
1646 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1648 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1649 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1650 }
1651
1652 if (IsByVal) {
1653 auto PtrVT = getPointerTy(DL);
1654 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1655 DAG.getConstant(CurOffset, dl, PtrVT));
1656 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1657 PartAlign);
1658 } else if (ExtendIntegerParam) {
1659 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1660 // zext/sext to i32
1661 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1663 dl, MVT::i32, StVal);
1664 }
1665
1666 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1667 // Use 16-bit registers for small stores as it's the
1668 // smallest general purpose register size supported by NVPTX.
1669 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1670 }
1671
1672 // Record the value to store.
1673 StoreOperands.push_back(StVal);
1674
1675 if (VectorInfo[j] & PVF_LAST) {
1676 unsigned NumElts = StoreOperands.size() - 3;
1678 switch (NumElts) {
1679 case 1:
1681 break;
1682 case 2:
1684 break;
1685 case 4:
1687 break;
1688 default:
1689 llvm_unreachable("Invalid vector info.");
1690 }
1691
1692 StoreOperands.push_back(InFlag);
1693
1694 // Adjust type of the store op if we've extended the scalar
1695 // return value.
1696 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1697
1698 Chain = DAG.getMemIntrinsicNode(
1699 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1700 TheStoreType, MachinePointerInfo(), PartAlign,
1702 InFlag = Chain.getValue(1);
1703
1704 // Cleanup.
1705 StoreOperands.clear();
1706
1707 // TODO: We may need to support vector types that can be passed
1708 // as scalars in variadic arguments.
1709 if (!IsByVal && IsVAArg) {
1710 assert(NumElts == 1 &&
1711 "Vectorization is expected to be disabled for variadics.");
1712 VAOffset += DL.getTypeAllocSize(
1713 TheStoreType.getTypeForEVT(*DAG.getContext()));
1714 }
1715 }
1716 if (!IsByVal)
1717 ++OIdx;
1718 }
1719 assert(StoreOperands.empty() && "Unfinished parameter store.");
1720 if (!IsByVal && VTs.size() > 0)
1721 --OIdx;
1722 ++ParamCount;
1723 if (IsByVal && IsVAArg)
1724 VAOffset += TypeSize;
1725 }
1726
1727 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1728 MaybeAlign retAlignment = std::nullopt;
1729
1730 // Handle Result
1731 if (Ins.size() > 0) {
1732 SmallVector<EVT, 16> resvtparts;
1733 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1734
1735 // Declare
1736 // .param .align 16 .b8 retval0[<size-in-bytes>], or
1737 // .param .b<size-in-bits> retval0
1738 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1739 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1740 // these three types to match the logic in
1741 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1742 // Plus, this behavior is consistent with nvcc's.
1743 if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1744 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1745 resultsz = promoteScalarArgumentSize(resultsz);
1746 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1747 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1748 DAG.getConstant(resultsz, dl, MVT::i32),
1749 DAG.getConstant(0, dl, MVT::i32), InFlag };
1750 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1751 DeclareRetOps);
1752 InFlag = Chain.getValue(1);
1753 } else {
1754 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1755 assert(retAlignment && "retAlignment is guaranteed to be set");
1756 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1757 SDValue DeclareRetOps[] = {
1758 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1759 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1760 DAG.getConstant(0, dl, MVT::i32), InFlag};
1761 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1762 DeclareRetOps);
1763 InFlag = Chain.getValue(1);
1764 }
1765 }
1766
1767 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1768 // Set the size of the vararg param byte array if the callee is a variadic
1769 // function and the variadic part is not empty.
1770 if (HasVAArgs) {
1771 SDValue DeclareParamOps[] = {
1772 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1773 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1774 VADeclareParam.getOperand(4)};
1775 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1776 VADeclareParam->getVTList(), DeclareParamOps);
1777 }
1778
1779 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1780 // between them we must rely on the call site value which is valid for
1781 // indirect calls but is always null for libcalls.
1782 bool isIndirectCall = !Func && CB;
1783
1784 if (isa<ExternalSymbolSDNode>(Callee)) {
1785 Function* CalleeFunc = nullptr;
1786
1787 // Try to find the callee in the current module.
1788 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1789 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1790
1791 // Set the "libcall callee" attribute to indicate that the function
1792 // must always have a declaration.
1793 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1794 }
1795
1796 if (isIndirectCall) {
1797 // This is the indirect function call case: PTX requires a prototype of the
1798 // form
1799 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1800 // to be emitted, and the label has to be used as the last arg of the call
1801 // instruction.
1802 // The prototype is embedded in a string and put as the operand for a
1803 // CallPrototype SDNode which will print out to the value of the string.
1804 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1805 std::string Proto = getPrototype(
1806 DL, RetTy, Args, Outs, retAlignment,
1807 HasVAArgs
1808 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1809 CLI.NumFixedArgs,
1810 cast<ConstantSDNode>(VADeclareParam->getOperand(1))
1811 ->getAPIntValue()))
1812 : std::nullopt,
1813 *CB, UniqueCallSite);
1814 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1815 SDValue ProtoOps[] = {
1816 Chain,
1817 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1818 InFlag,
1819 };
1820 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1821 InFlag = Chain.getValue(1);
1822 }
1823 // Op to just print "call"
1824 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1825 SDValue PrintCallOps[] = {
1826 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1827 };
1828 // We model convergent calls as separate opcodes.
1829 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1830 if (CLI.IsConvergent)
1831 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1832 : NVPTXISD::PrintConvergentCall;
1833 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1834 InFlag = Chain.getValue(1);
1835
1836 // Ops to print out the function name
1837 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1838 SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1839 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1840 InFlag = Chain.getValue(1);
1841
1842 // Ops to print out the param list
1843 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1844 SDValue CallArgBeginOps[] = { Chain, InFlag };
1845 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1846 CallArgBeginOps);
1847 InFlag = Chain.getValue(1);
1848
1849 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
1850 ++i) {
1851 unsigned opcode;
1852 if (i == (e - 1))
1853 opcode = NVPTXISD::LastCallArg;
1854 else
1855 opcode = NVPTXISD::CallArg;
1856 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1857 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1858 DAG.getConstant(i, dl, MVT::i32), InFlag };
1859 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1860 InFlag = Chain.getValue(1);
1861 }
1862 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1863 SDValue CallArgEndOps[] = { Chain,
1864 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1865 InFlag };
1866 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1867 InFlag = Chain.getValue(1);
1868
1869 if (isIndirectCall) {
1870 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1871 SDValue PrototypeOps[] = {
1872 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
1873 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1874 InFlag = Chain.getValue(1);
1875 }
1876
1877 SmallVector<SDValue, 16> ProxyRegOps;
1878 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
1879
1880 // Generate loads from param memory/moves from registers for result
1881 if (Ins.size() > 0) {
1882 SmallVector<EVT, 16> VTs;
1883 SmallVector<uint64_t, 16> Offsets;
1884 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1885 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1886
1887 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1888 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1889
1890 SmallVector<EVT, 6> LoadVTs;
1891 int VecIdx = -1; // Index of the first element of the vector.
1892
1893 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1894 // 32-bits are sign extended or zero extended, depending on whether
1895 // they are signed or unsigned types.
1896 bool ExtendIntegerRetVal =
1897 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1898
1899 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1900 bool needTruncate = false;
1901 EVT TheLoadType = VTs[i];
1902 EVT EltType = Ins[i].VT;
1903 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
1904 MVT PromotedVT;
1905
1906 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
1907 TheLoadType = EVT(PromotedVT);
1908 EltType = EVT(PromotedVT);
1909 needTruncate = true;
1910 }
1911
1912 if (ExtendIntegerRetVal) {
1913 TheLoadType = MVT::i32;
1914 EltType = MVT::i32;
1915 needTruncate = true;
1916 } else if (TheLoadType.getSizeInBits() < 16) {
1917 if (VTs[i].isInteger())
1918 needTruncate = true;
1919 EltType = MVT::i16;
1920 }
1921
1922 // Record index of the very first element of the vector.
1923 if (VectorInfo[i] & PVF_FIRST) {
1924 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1925 VecIdx = i;
1926 }
1927
1928 LoadVTs.push_back(EltType);
1929
1930 if (VectorInfo[i] & PVF_LAST) {
1931 unsigned NumElts = LoadVTs.size();
1932 LoadVTs.push_back(MVT::Other);
1933 LoadVTs.push_back(MVT::Glue);
1934 NVPTXISD::NodeType Op;
1935 switch (NumElts) {
1936 case 1:
1937 Op = NVPTXISD::LoadParam;
1938 break;
1939 case 2:
1940 Op = NVPTXISD::LoadParamV2;
1941 break;
1942 case 4:
1943 Op = NVPTXISD::LoadParamV4;
1944 break;
1945 default:
1946 llvm_unreachable("Invalid vector info.");
1947 }
1948
1949 SDValue LoadOperands[] = {
1950 Chain, DAG.getConstant(1, dl, MVT::i32),
1951 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1952 SDValue RetVal = DAG.getMemIntrinsicNode(
1953 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1954 MachinePointerInfo(), EltAlign,
1955 MachineMemOperand::MOLoad);
1956
1957 for (unsigned j = 0; j < NumElts; ++j) {
1958 ProxyRegOps.push_back(RetVal.getValue(j));
1959
1960 if (needTruncate)
1961 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
1962 else
1963 ProxyRegTruncates.push_back(std::optional<MVT>());
1964 }
1965
1966 Chain = RetVal.getValue(NumElts);
1967 InFlag = RetVal.getValue(NumElts + 1);
1968
1969 // Cleanup
1970 VecIdx = -1;
1971 LoadVTs.clear();
1972 }
1973 }
1974 }
1975
1976 Chain =
1977 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InFlag, dl);
1978 InFlag = Chain.getValue(1);
1979
1980 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1981 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1982 // dangling.
1983 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1984 SDValue Ret = DAG.getNode(
1985 NVPTXISD::ProxyReg, dl,
1986 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1987 { Chain, ProxyRegOps[i], InFlag }
1988 );
1989
1990 Chain = Ret.getValue(1);
1991 InFlag = Ret.getValue(2);
1992
1993 if (ProxyRegTruncates[i]) {
1994 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
1995 }
1996
1997 InVals.push_back(Ret);
1998 }
1999
2000 // set isTailCall to false for now, until we figure out how to express
2001 // tail call optimization in PTX
2002 isTailCall = false;
2003 return Chain;
2004}
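// Taken together, LowerCall emits a chain of NVPTXISD nodes (StoreParam,
// DeclareRet/DeclareRetParam, CallPrototype for indirect calls, PrintCall,
// CallVoid, CallArgBegin/CallArg/LastCallArg/CallArgEnd, LoadParam and
// ProxyReg) that instruction selection and the asm printer eventually turn
// into the PTX .param declarations, st.param/ld.param accesses and the call
// instruction itself.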
2005
2006// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2007// (see LegalizeDAG.cpp). This is slow and uses local memory.
2008// We use extract/insert/build vector just as LegalizeOp() does in llvm 2.5.
2009SDValue
2010NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2011 SDNode *Node = Op.getNode();
2012 SDLoc dl(Node);
2013 SmallVector<SDValue, 8> Ops;
2014 unsigned NumOperands = Node->getNumOperands();
2015 for (unsigned i = 0; i < NumOperands; ++i) {
2016 SDValue SubOp = Node->getOperand(i);
2017 EVT VVT = SubOp.getNode()->getValueType(0);
2018 EVT EltVT = VVT.getVectorElementType();
2019 unsigned NumSubElem = VVT.getVectorNumElements();
2020 for (unsigned j = 0; j < NumSubElem; ++j) {
2021 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2022 DAG.getIntPtrConstant(j, dl)));
2023 }
2024 }
2025 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2026}
2027
2028// We can initialize a constant f16x2 with a single .b32 move. Normally it
2029// would get lowered as two constant loads and a vector-packing move.
2030// mov.b16 %h1, 0x4000;
2031// mov.b16 %h2, 0x3C00;
2032// mov.b32 %hh2, {%h2, %h1};
2033// Instead we want just a constant move:
2034// mov.b32 %hh2, 0x40003C00
2035//
2036// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
2037// generates good SASS in both cases.
2038SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2039 SelectionDAG &DAG) const {
2040 if (!(Op->getValueType(0) == MVT::v2f16 &&
2041 isa<ConstantFPSDNode>(Op->getOperand(0)) &&
2042 isa<ConstantFPSDNode>(Op->getOperand(1))))
2043 return Op;
2044
2045 APInt E0 =
2046 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
2047 APInt E1 =
2048 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
2049 SDValue Const =
2050 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
2051 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
2052}
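// For example, for the constant vector {1.0, 2.0}: element 0 is 0x3C00 and
// element 1 is 0x4000 as IEEE half bit patterns, so the packed immediate is
// (0x4000 << 16) | 0x3C00 == 0x40003C00, matching the single mov.b32 shown
// in the comment above.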
2053
2054SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2055 SelectionDAG &DAG) const {
2056 SDValue Index = Op->getOperand(1);
2057 // Constant index will be matched by tablegen.
2058 if (isa<ConstantSDNode>(Index.getNode()))
2059 return Op;
2060
2061 // Extract individual elements and select one of them.
2062 SDValue Vector = Op->getOperand(0);
2063 EVT VectorVT = Vector.getValueType();
2064 assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
2065 EVT EltVT = VectorVT.getVectorElementType();
2066
2067 SDLoc dl(Op.getNode());
2068 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2069 DAG.getIntPtrConstant(0, dl));
2070 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2071 DAG.getIntPtrConstant(1, dl));
2072 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2073 ISD::CondCode::SETEQ);
2074}
2075
2076/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which either
2077/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2078/// amount, or
2079/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2080/// amount.
2081SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2082 SelectionDAG &DAG) const {
2083 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2084 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2085
2086 EVT VT = Op.getValueType();
2087 unsigned VTBits = VT.getSizeInBits();
2088 SDLoc dl(Op);
2089 SDValue ShOpLo = Op.getOperand(0);
2090 SDValue ShOpHi = Op.getOperand(1);
2091 SDValue ShAmt = Op.getOperand(2);
2092 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2093
2094 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2095 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2096 // {dHi, dLo} = {aHi, aLo} >> Amt
2097 // dHi = aHi >> Amt
2098 // dLo = shf.r.clamp aLo, aHi, Amt
2099
2100 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2101 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2102 ShAmt);
2103
2104 SDValue Ops[2] = { Lo, Hi };
2105 return DAG.getMergeValues(Ops, dl);
2106 }
2107 else {
2108 // {dHi, dLo} = {aHi, aLo} >> Amt
2109 // - if (Amt>=size) then
2110 // dLo = aHi >> (Amt-size)
2111 // dHi = aHi >> Amt (this is either all 0 or all 1)
2112 // else
2113 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2114 // dHi = aHi >> Amt
2115
2116 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2117 DAG.getConstant(VTBits, dl, MVT::i32),
2118 ShAmt);
2119 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2120 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2121 DAG.getConstant(VTBits, dl, MVT::i32));
2122 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2123 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2124 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2125
2126 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2127 DAG.getConstant(VTBits, dl, MVT::i32),
2128 ISD::SETGE);
2129 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2130 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2131
2132 SDValue Ops[2] = { Lo, Hi };
2133 return DAG.getMergeValues(Ops, dl);
2134 }
2135}
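// Worked example for the generic (non-funnel-shift) path with VTBits == 32:
// shifting {aHi, aLo} = {0x00000001, 0x00000000} right by Amt == 4 gives
// dLo = (aLo >>u 4) | (aHi << 28) == 0x10000000 and dHi = aHi >> 4 == 0,
// i.e. the 64-bit value 0x0000000100000000 >> 4 == 0x0000000010000000.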
2136
2137/// LowerShiftLeftParts - Lower SHL_PARTS, which either
2138/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2139/// amount, or
2140/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2141/// amount.
2142SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2143 SelectionDAG &DAG) const {
2144 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2145 assert(Op.getOpcode() == ISD::SHL_PARTS);
2146
2147 EVT VT = Op.getValueType();
2148 unsigned VTBits = VT.getSizeInBits();
2149 SDLoc dl(Op);
2150 SDValue ShOpLo = Op.getOperand(0);
2151 SDValue ShOpHi = Op.getOperand(1);
2152 SDValue ShAmt = Op.getOperand(2);
2153
2154 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2155 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2156 // {dHi, dLo} = {aHi, aLo} << Amt
2157 // dHi = shf.l.clamp aLo, aHi, Amt
2158 // dLo = aLo << Amt
2159
2160 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2161 ShAmt);
2162 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2163
2164 SDValue Ops[2] = { Lo, Hi };
2165 return DAG.getMergeValues(Ops, dl);
2166 }
2167 else {
2168 // {dHi, dLo} = {aHi, aLo} << Amt
2169 // - if (Amt>=size) then
2170 // dLo = aLo << Amt (all 0)
2171 // dHi = aLo << (Amt-size)
2172 // else
2173 // dLo = aLo << Amt
2174 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2175
2176 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2177 DAG.getConstant(VTBits, dl, MVT::i32),
2178 ShAmt);
2179 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2180 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2181 DAG.getConstant(VTBits, dl, MVT::i32));
2182 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2183 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2184 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2185
2186 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2187 DAG.getConstant(VTBits, dl, MVT::i32),
2188 ISD::SETGE);
2189 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2190 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2191
2192 SDValue Ops[2] = { Lo, Hi };
2193 return DAG.getMergeValues(Ops, dl);
2194 }
2195}
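// Worked example for the generic path with VTBits == 32: shifting
// {aHi, aLo} = {0x00000000, 0x80000001} left by Amt == 4 gives
// dHi = (aHi << 4) | (aLo >> 28) == 0x00000008 and dLo = aLo << 4 == 0x00000010,
// i.e. the 64-bit value 0x0000000080000001 << 4 == 0x0000000800000010.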
2196
2197SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2198 EVT VT = Op.getValueType();
2199
2200 if (VT == MVT::f32)
2201 return LowerFROUND32(Op, DAG);
2202
2203 if (VT == MVT::f64)
2204 return LowerFROUND64(Op, DAG);
2205
2206 llvm_unreachable("unhandled type");
2207}
2208
2209// This is the rounding method used in CUDA libdevice, shown here as C-like code:
2210// float roundf(float A)
2211// {
2212// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2213// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2214// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2215// }
2216SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2217 SelectionDAG &DAG) const {
2218 SDLoc SL(Op);
2219 SDValue A = Op.getOperand(0);
2220 EVT VT = Op.getValueType();
2221
2222 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2223
2224 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2225 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2226 const int SignBitMask = 0x80000000;
2227 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2228 DAG.getConstant(SignBitMask, SL, MVT::i32));
2229 const int PointFiveInBits = 0x3F000000;
2230 SDValue PointFiveWithSignRaw =
2231 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2232 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2233 SDValue PointFiveWithSign =
2234 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2235 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2236 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2237
2238 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2239 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2240 SDValue IsLarge =
2241 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2242 ISD::SETOGT);
2243 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2244
2245 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2246 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2247 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2248 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2249 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2250}
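// For example: A = 2.3 gives ftrunc(2.3 + 0.5) == 2.0; A = 2.5 gives
// ftrunc(3.0) == 3.0 (ties round away from zero); and A = -0.3 takes the
// abs(A) < 0.5 path and returns ftrunc(-0.3) == -0.0, preserving the sign.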
2251
2252// The implementation of round(double) is similar to that of round(float) in
2253// that they both separate the value range into three regions and use a method
2254// specific to the region to round the values. However, round(double) first
2255// calculates the round of the absolute value and then adds the sign back while
2256// round(float) directly rounds the value with sign.
2257SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2258 SelectionDAG &DAG) const {
2259 SDLoc SL(Op);
2260 SDValue A = Op.getOperand(0);
2261 EVT VT = Op.getValueType();
2262
2263 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2264
2265 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2266 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2267 DAG.getConstantFP(0.5, SL, VT));
2268 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2269
2270 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2271 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2272 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2273 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2274 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2275 DAG.getConstantFP(0, SL, VT),
2276 RoundedA);
2277
2278 // Add sign to rounded_A
2279 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2280 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2281
2282 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2283 SDValue IsLarge =
2284 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2285 ISD::SETOGT);
2286 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2287}
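// For example, A = -2.5: abs(A) + 0.5 == 3.0, ftrunc gives 3.0, and
// FCOPYSIGN restores the sign, yielding -3.0 (round-half-away-from-zero).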
2288
2289
2290
2291 SDValue
2292 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2293 switch (Op.getOpcode()) {
2294 case ISD::RETURNADDR:
2295 return SDValue();
2296 case ISD::FRAMEADDR:
2297 return SDValue();
2298 case ISD::GlobalAddress:
2299 return LowerGlobalAddress(Op, DAG);
2300 case ISD::INTRINSIC_W_CHAIN:
2301 return Op;
2302 case ISD::BUILD_VECTOR:
2303 return LowerBUILD_VECTOR(Op, DAG);
2304 case ISD::EXTRACT_SUBVECTOR:
2305 return Op;
2306 case ISD::EXTRACT_VECTOR_ELT:
2307 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2308 case ISD::CONCAT_VECTORS:
2309 return LowerCONCAT_VECTORS(Op, DAG);
2310 case ISD::STORE:
2311 return LowerSTORE(Op, DAG);
2312 case ISD::LOAD:
2313 return LowerLOAD(Op, DAG);
2314 case ISD::SHL_PARTS:
2315 return LowerShiftLeftParts(Op, DAG);
2316 case ISD::SRA_PARTS:
2317 case ISD::SRL_PARTS:
2318 return LowerShiftRightParts(Op, DAG);
2319 case ISD::SELECT:
2320 return LowerSelect(Op, DAG);
2321 case ISD::FROUND:
2322 return LowerFROUND(Op, DAG);
2323 case ISD::VAARG:
2324 return LowerVAARG(Op, DAG);
2325 case ISD::VASTART:
2326 return LowerVASTART(Op, DAG);
2327 default:
2328 llvm_unreachable("Custom lowering not defined for operation");
2329 }
2330}
2331
2332// This function is almost a copy of SelectionDAG::expandVAArg().
2333// The only difference is that this one produces loads from the local address space.
2334SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2335 const TargetLowering *TLI = STI.getTargetLowering();
2336 SDLoc DL(Op);
2337
2338 SDNode *Node = Op.getNode();
2339 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2340 EVT VT = Node->getValueType(0);
2341 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2342 SDValue Tmp1 = Node->getOperand(0);
2343 SDValue Tmp2 = Node->getOperand(1);
2344 const MaybeAlign MA(Node->getConstantOperandVal(3));
2345
2346 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2347 Tmp1, Tmp2, MachinePointerInfo(V));
2348 SDValue VAList = VAListLoad;
2349
2350 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2351 VAList = DAG.getNode(
2352 ISD::ADD, DL, VAList.getValueType(), VAList,
2353 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2354
2355 VAList = DAG.getNode(
2356 ISD::AND, DL, VAList.getValueType(), VAList,
2357 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2358 }
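// For example, with VAList == 0x1004 and an 8-byte alignment the two nodes
// above compute (0x1004 + 7) & -8 == 0x1008, rounding the pointer up to the
// argument's alignment.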
2359
2360 // Increment the pointer, VAList, to the next vaarg
2361 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2362 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2363 DL, VAList.getValueType()));
2364
2365 // Store the incremented VAList to the legalized pointer
2366 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2367 MachinePointerInfo(V));
2368
2369 const Value *SrcV =
2370 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2371
2372 // Load the actual argument out of the pointer VAList
2373 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2374}
2375
2376SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2377 const TargetLowering *TLI = STI.getTargetLowering();
2378 SDLoc DL(Op);
2379 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2380
2381 // Store the address of unsized array <function>_vararg[] in the ap object.
2382 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2383 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2384
2385 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2386 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2387 MachinePointerInfo(SV));
2388}
2389
2390SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2391 SDValue Op0 = Op->getOperand(0);
2392 SDValue Op1 = Op->getOperand(1);
2393 SDValue Op2 = Op->getOperand(2);
2394 SDLoc DL(Op.getNode());
2395
2396 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2397
2398 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2399 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2400 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2401 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2402
2403 return Trunc;
2404}
2405
2406SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2407 if (Op.getValueType() == MVT::i1)
2408 return LowerLOADi1(Op, DAG);
2409
2410 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2411 // loads and have to handle it here.
2412 if (Op.getValueType() == MVT::v2f16) {
2413 LoadSDNode *Load = cast<LoadSDNode>(Op);
2414 EVT MemVT = Load->getMemoryVT();
2415 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2416 MemVT, *Load->getMemOperand())) {
2417 SDValue Ops[2];
2418 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2419 return DAG.getMergeValues(Ops, SDLoc(Op));
2420 }
2421 }
2422
2423 return SDValue();
2424}
2425
2426// v = ld i1* addr
2427// =>
2428// v1 = ld i8* addr (-> i16)
2429// v = trunc i16 to i1
2430SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2431 SDNode *Node = Op.getNode();
2432 LoadSDNode *LD = cast<LoadSDNode>(Node);
2433 SDLoc dl(Node);
2434 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2435 assert(Node->getValueType(0) == MVT::i1 &&
2436 "Custom lowering for i1 load only");
2437 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2438 LD->getPointerInfo(), LD->getAlign(),
2439 LD->getMemOperand()->getFlags());
2440 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2441 // The legalizer (the caller) is expecting two values from the legalized
2442 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2443 // in LegalizeDAG.cpp which also uses MergeValues.
2444 SDValue Ops[] = { result, LD->getChain() };
2445 return DAG.getMergeValues(Ops, dl);
2446}
2447
2448SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2449 StoreSDNode *Store = cast<StoreSDNode>(Op);
2450 EVT VT = Store->getMemoryVT();
2451
2452 if (VT == MVT::i1)
2453 return LowerSTOREi1(Op, DAG);
2454
2455 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2456 // stores and have to handle it here.
2457 if (VT == MVT::v2f16 &&
2458 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2459 VT, *Store->getMemOperand()))
2460 return expandUnalignedStore(Store, DAG);
2461
2462 if (VT.isVector())
2463 return LowerSTOREVector(Op, DAG);
2464
2465 return SDValue();
2466}
2467
2468SDValue
2469NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2470 SDNode *N = Op.getNode();
2471 SDValue Val = N->getOperand(1);
2472 SDLoc DL(N);
2473 EVT ValVT = Val.getValueType();
2474
2475 if (ValVT.isVector()) {
2476 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2477 // legal. We can (and should) split that into 2 stores of <2 x double> here
2478 // but I'm leaving that as a TODO for now.
2479 if (!ValVT.isSimple())
2480 return SDValue();
2481 switch (ValVT.getSimpleVT().SimpleTy) {
2482 default:
2483 return SDValue();
2484 case MVT::v2i8:
2485 case MVT::v2i16:
2486 case MVT::v2i32:
2487 case MVT::v2i64:
2488 case MVT::v2f16:
2489 case MVT::v2bf16:
2490 case MVT::v2f32:
2491 case MVT::v2f64:
2492 case MVT::v4i8:
2493 case MVT::v4i16:
2494 case MVT::v4i32:
2495 case MVT::v4f16:
2496 case MVT::v4bf16:
2497 case MVT::v4f32:
2498 case MVT::v8f16: // <4 x f16x2>
2499 case MVT::v8bf16: // <4 x bf16x2>
2500 // This is a "native" vector type
2501 break;
2502 }
2503
2504 MemSDNode *MemSD = cast<MemSDNode>(N);
2505 const DataLayout &TD = DAG.getDataLayout();
2506
2507 Align Alignment = MemSD->getAlign();
2508 Align PrefAlign =
2509 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2510 if (Alignment < PrefAlign) {
2511 // This store is not sufficiently aligned, so bail out and let this vector
2512 // store be scalarized. Note that we may still be able to emit smaller
2513 // vector stores. For example, if we are storing a <4 x float> with an
2514 // alignment of 8, this check will fail but the legalizer will try again
2515 // with 2 x <2 x float>, which will succeed with an alignment of 8.
2516 return SDValue();
2517 }
2518
2519 unsigned Opcode = 0;
2520 EVT EltVT = ValVT.getVectorElementType();
2521 unsigned NumElts = ValVT.getVectorNumElements();
2522
2523 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2524 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2525 // stored type to i16 and propagate the "real" type as the memory type.
2526 bool NeedExt = false;
2527 if (EltVT.getSizeInBits() < 16)
2528 NeedExt = true;
2529
2530 bool StoreF16x2 = false;
2531 switch (NumElts) {
2532 default:
2533 return SDValue();
2534 case 2:
2535 Opcode = NVPTXISD::StoreV2;
2536 break;
2537 case 4:
2538 Opcode = NVPTXISD::StoreV4;
2539 break;
2540 case 8:
2541 // v8f16 is a special case. PTX doesn't have st.v8.f16
2542 // instruction. Instead, we split the vector into v2f16 chunks and
2543 // store them with st.v4.b32.
2544 assert((EltVT == MVT::f16 || EltVT == MVT::bf16) &&
2545 "Wrong type for the vector.");
2546 Opcode = NVPTXISD::StoreV4;
2547 StoreF16x2 = true;
2548 break;
2549 }
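// For example, a sufficiently aligned <4 x float> store is emitted as
// NVPTXISD::StoreV4 (st.v4.f32), while <8 x f16> takes the StoreF16x2 path
// below and is stored as four v2f16 chunks (st.v4.b32, as noted above).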
2550
2551 SmallVector<SDValue, 8> Ops;
2552
2553 // First is the chain
2554 Ops.push_back(N->getOperand(0));
2555
2556 if (StoreF16x2) {
2557 // Combine f16,f16 -> v2f16
2558 NumElts /= 2;
2559 for (unsigned i = 0; i < NumElts; ++i) {
2560 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2561 DAG.getIntPtrConstant(i * 2, DL));
2562 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2563 DAG.getIntPtrConstant(i * 2 + 1, DL));
2564 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2565 Ops.push_back(V2);
2566 }
2567 } else {
2568 // Then the split values
2569 for (unsigned i = 0; i < NumElts; ++i) {
2570 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2571 DAG.getIntPtrConstant(i, DL));
2572 if (NeedExt)
2573 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2574 Ops.push_back(ExtVal);
2575 }
2576 }
2577
2578 // Then any remaining arguments
2579 Ops.append(N->op_begin() + 2, N->op_end());
2580
2581 SDValue NewSt =
2582 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2583 MemSD->getMemoryVT(), MemSD->getMemOperand());
2584
2585 // return DCI.CombineTo(N, NewSt, true);
2586 return NewSt;
2587 }
2588
2589 return SDValue();
2590}
2591
2592// st i1 v, addr
2593// =>
2594// v1 = zxt v to i16
2595// st.u8 i16, addr
2596SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2597 SDNode *Node = Op.getNode();
2598 SDLoc dl(Node);
2599 StoreSDNode *ST = cast<StoreSDNode>(Node);
2600 SDValue Tmp1 = ST->getChain();
2601 SDValue Tmp2 = ST->getBasePtr();
2602 SDValue Tmp3 = ST->getValue();
2603 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2604 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2605 SDValue Result =
2606 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2607 ST->getAlign(), ST->getMemOperand()->getFlags());
2608 return Result;
2609}
2610
2611// This creates a target external symbol for a function parameter.
2612// The name of the symbol is composed from its index and the function name.
2613// A negative index corresponds to the special parameter (unsized array) used
2614// for passing variable arguments.
2615SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
2616 EVT v) const {
2617 StringRef SavedStr = nvTM->getStrPool().save(
2619 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
2620}
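// For example, the first parameter of a function foo is referenced through a
// symbol such as "foo_param_0", while idx == -1 selects the "foo_vararg"
// array used by LowerVASTART above.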
2621
2622SDValue NVPTXTargetLowering::LowerFormalArguments(
2623 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2624 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2625 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2626 MachineFunction &MF = DAG.getMachineFunction();
2627 const DataLayout &DL = DAG.getDataLayout();
2628 auto PtrVT = getPointerTy(DAG.getDataLayout());
2629
2630 const Function *F = &MF.getFunction();
2631 const AttributeList &PAL = F->getAttributes();
2632 const TargetLowering *TLI = STI.getTargetLowering();
2633
2634 SDValue Root = DAG.getRoot();
2635 std::vector<SDValue> OutChains;
2636
2637 bool isABI = (STI.getSmVersion() >= 20);
2638 assert(isABI && "Non-ABI compilation is not supported");
2639 if (!isABI)
2640 return Chain;
2641
2642 std::vector<Type *> argTypes;
2643 std::vector<const Argument *> theArgs;
2644 for (const Argument &I : F->args()) {
2645 theArgs.push_back(&I);
2646 argTypes.push_back(I.getType());
2647 }
2648 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2649 // Ins.size() will be larger
2650 // * if there is an aggregate argument with multiple fields (each field
2651 // showing up separately in Ins)
2652 // * if there is a vector argument with more than typical vector-length
2653 // elements (generally if more than 4) where each vector element is
2654 // individually present in Ins.
2655 // So a different index should be used for indexing into Ins.
2656 // See similar issue in LowerCall.
2657 unsigned InsIdx = 0;
2658
2659 int idx = 0;
2660 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2661 Type *Ty = argTypes[i];
2662
2663 if (theArgs[i]->use_empty()) {
2664 // argument is dead
2665 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2666 SmallVector<EVT, 16> vtparts;
2667
2668 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2669 if (vtparts.empty())
2670 report_fatal_error("Empty parameter types are not supported");
2671
2672 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2673 ++parti) {
2674 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2675 ++InsIdx;
2676 }
2677 if (vtparts.size() > 0)
2678 --InsIdx;
2679 continue;
2680 }
2681 if (Ty->isVectorTy()) {
2682 EVT ObjectVT = getValueType(DL, Ty);
2683 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2684 for (unsigned parti = 0; parti < NumRegs; ++parti) {
2685 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2686 ++InsIdx;
2687 }
2688 if (NumRegs > 0)
2689 --InsIdx;
2690 continue;
2691 }
2692 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2693 continue;
2694 }
2695
2696 // In the following cases, assign a node order of "idx+1"
2697 // to newly created nodes. The SDNodes for params have to
2698 // appear in the same order as their order of appearance
2699 // in the original function. "idx+1" holds that order.
2700 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
2701 bool aggregateIsPacked = false;
2702 if (StructType *STy = dyn_cast<StructType>(Ty))
2703 aggregateIsPacked = STy->isPacked();
2704
2705 SmallVector<EVT, 16> VTs;
2706 SmallVector<uint64_t, 16> Offsets;
2707 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2708 if (VTs.empty())
2709 report_fatal_error("Empty parameter types are not supported");
2710
2711 auto VectorInfo =
2712 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
2713
2714 SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2715 int VecIdx = -1; // Index of the first element of the current vector.
2716 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2717 if (VectorInfo[parti] & PVF_FIRST) {
2718 assert(VecIdx == -1 && "Orphaned vector.");
2719 VecIdx = parti;
2720 }
2721
2722 // That's the last element of this store op.
2723 if (VectorInfo[parti] & PVF_LAST) {
2724 unsigned NumElts = parti - VecIdx + 1;
2725 EVT EltVT = VTs[parti];
2726 // i1 is loaded/stored as i8.
2727 EVT LoadVT = EltVT;
2728 if (EltVT == MVT::i1)
2729 LoadVT = MVT::i8;
2730 else if (EltVT == MVT::v2f16)
2731 // getLoad needs a vector type, but it can't handle
2732 // vectors which contain v2f16 elements. So we must load
2733 // using i32 here and then bitcast back.
2734 LoadVT = MVT::i32;
2735
2736 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2737 SDValue VecAddr =
2738 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2739 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2740 Value *srcValue = Constant::getNullValue(PointerType::get(
2741 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2742 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
2743 MachinePointerInfo(srcValue),
2744 MaybeAlign(aggregateIsPacked ? 1 : 0),
2745 MachineMemOperand::MODereferenceable |
2746 MachineMemOperand::MOInvariant);
2747 if (P.getNode())
2748 P.getNode()->setIROrder(idx + 1);
2749 for (unsigned j = 0; j < NumElts; ++j) {
2750 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2751 DAG.getIntPtrConstant(j, dl));
2752 // We've loaded i1 as an i8 and now must truncate it back to i1
2753 if (EltVT == MVT::i1)
2754 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2755 // v2f16 was loaded as an i32. Now we must bitcast it back.
2756 else if (EltVT == MVT::v2f16)
2757 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2758
2759 // If a promoted integer type is used, truncate down to the original
2760 MVT PromotedVT;
2761 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
2762 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
2763 }
2764
2765 // Extend the element if necessary (e.g. an i8 is loaded
2766 // into an i16 register)
2767 if (Ins[InsIdx].VT.isInteger() &&
2768 Ins[InsIdx].VT.getFixedSizeInBits() >
2769 LoadVT.getFixedSizeInBits()) {
2770 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2771 : ISD::ZERO_EXTEND;
2772 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2773 }
2774 InVals.push_back(Elt);
2775 }
2776
2777 // Reset vector tracking state.
2778 VecIdx = -1;
2779 }
2780 ++InsIdx;
2781 }
2782 if (VTs.size() > 0)
2783 --InsIdx;
2784 continue;
2785 }
2786
2787 // Param has ByVal attribute
2788 // Return MoveParam(param symbol).
2789 // Ideally, the param symbol could be returned directly,
2790 // but when the SDNode builder decides to use it in a CopyToReg(),
2791 // machine instruction generation fails because a TargetExternalSymbol
2792 // (not lowered) is target dependent, and CopyToReg assumes
2793 // the source is lowered.
2794 EVT ObjectVT = getValueType(DL, Ty);
2795 assert(ObjectVT == Ins[InsIdx].VT &&
2796 "Ins type did not match function type");
2797 SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2798 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2799 if (p.getNode())
2800 p.getNode()->setIROrder(idx + 1);
2801 InVals.push_back(p);
2802 }
2803
2804 if (!OutChains.empty())
2805 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2806
2807 return Chain;
2808}
2809
2810SDValue
2811NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2812 bool isVarArg,
2813 const SmallVectorImpl<ISD::OutputArg> &Outs,
2814 const SmallVectorImpl<SDValue> &OutVals,
2815 const SDLoc &dl, SelectionDAG &DAG) const {
2816 const MachineFunction &MF = DAG.getMachineFunction();
2817 const Function &F = MF.getFunction();
2818 Type *RetTy = MF.getFunction().getReturnType();
2819
2820 bool isABI = (STI.getSmVersion() >= 20);
2821 assert(isABI && "Non-ABI compilation is not supported");
2822 if (!isABI)
2823 return Chain;
2824
2825 const DataLayout &DL = DAG.getDataLayout();
2826 SmallVector<SDValue, 16> PromotedOutVals;
2827 SmallVector<EVT, 16> VTs;
2828 SmallVector<uint64_t, 16> Offsets;
2829 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2830 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2831
2832 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2833 SDValue PromotedOutVal = OutVals[i];
2834 MVT PromotedVT;
2835 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
2836 VTs[i] = EVT(PromotedVT);
2837 }
2838 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
2839 llvm::ISD::NodeType Ext =
2840 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2841 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
2842 }
2843 PromotedOutVals.push_back(PromotedOutVal);
2844 }
2845
2846 auto VectorInfo = VectorizePTXValueVTs(
2847 VTs, Offsets,
2848 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
2849 : Align(1));
2850
2851 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2852 // 32-bits are sign extended or zero extended, depending on whether
2853 // they are signed or unsigned types.
2854 bool ExtendIntegerRetVal =
2855 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2856
2857 SmallVector<SDValue, 6> StoreOperands;
2858 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2859 // New load/store. Record chain and offset operands.
2860 if (VectorInfo[i] & PVF_FIRST) {
2861 assert(StoreOperands.empty() && "Orphaned operand list.");
2862 StoreOperands.push_back(Chain);
2863 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2864 }
2865
2866 SDValue OutVal = OutVals[i];
2867 SDValue RetVal = PromotedOutVals[i];
2868
2869 if (ExtendIntegerRetVal) {
2870 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2871 : ISD::ZERO_EXTEND,
2872 dl, MVT::i32, RetVal);
2873 } else if (OutVal.getValueSizeInBits() < 16) {
2874 // Use 16-bit registers for small load-stores as it's the
2875 // smallest general purpose register size supported by NVPTX.
2876 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2877 }
2878
2879 // Record the value to return.
2880 StoreOperands.push_back(RetVal);
2881
2882 // That's the last element of this store op.
2883 if (VectorInfo[i] & PVF_LAST) {
2884 NVPTXISD::NodeType Op;
2885 unsigned NumElts = StoreOperands.size() - 2;
2886 switch (NumElts) {
2887 case 1:
2888 Op = NVPTXISD::StoreRetval;
2889 break;
2890 case 2:
2891 Op = NVPTXISD::StoreRetvalV2;
2892 break;
2893 case 4:
2894 Op = NVPTXISD::StoreRetvalV4;
2895 break;
2896 default:
2897 llvm_unreachable("Invalid vector info.");
2898 }
2899
2900 // Adjust type of load/store op if we've extended the scalar
2901 // return value.
2902 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2903 Chain = DAG.getMemIntrinsicNode(
2904 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
2905 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
2906 // Cleanup vector state.
2907 StoreOperands.clear();
2908 }
2909 }
2910
2911 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2912}
2913
2914void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2915 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2916 SelectionDAG &DAG) const {
2917 if (Constraint.length() > 1)
2918 return;
2919 else
2920 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2921}
2922
2923static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2924 switch (Intrinsic) {
2925 default:
2926 return 0;
2927
2928 case Intrinsic::nvvm_tex_1d_v4f32_s32:
2930 case Intrinsic::nvvm_tex_1d_v4f32_f32:
2932 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2934 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2936 case Intrinsic::nvvm_tex_1d_v4s32_s32:
2937 return NVPTXISD::Tex1DS32S32;
2938 case Intrinsic::nvvm_tex_1d_v4s32_f32:
2940 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2942 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2944 case Intrinsic::nvvm_tex_1d_v4u32_s32:
2945 return NVPTXISD::Tex1DU32S32;
2946 case Intrinsic::nvvm_tex_1d_v4u32_f32:
2948 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2950 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2952
2953 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2955 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2957 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2959 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2961 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2963 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2965 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2967 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2969 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2971 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2973 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2975 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2977
2978 case Intrinsic::nvvm_tex_2d_v4f32_s32:
2980 case Intrinsic::nvvm_tex_2d_v4f32_f32:
2982 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2984 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2986 case Intrinsic::nvvm_tex_2d_v4s32_s32:
2987 return NVPTXISD::Tex2DS32S32;
2988 case Intrinsic::nvvm_tex_2d_v4s32_f32:
2990 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2992 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2994 case Intrinsic::nvvm_tex_2d_v4u32_s32:
2995 return NVPTXISD::Tex2DU32S32;
2996 case Intrinsic::nvvm_tex_2d_v4u32_f32:
2998 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3000 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3002
3003 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3005 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3007 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3009 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3011 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3013 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3015 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3017 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3019 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3021 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3023 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3025 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3027
3028 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3030 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3032 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3034 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3036 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3037 return NVPTXISD::Tex3DS32S32;
3038 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3040 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3042 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3044 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3045 return NVPTXISD::Tex3DU32S32;
3046 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3048 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3050 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3052
3053 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3055 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3057 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3059 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3061 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3063 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3065
3066 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3068 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3070 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3072 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3074 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3076 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3078
3079 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3081 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3083 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3085 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3087 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3089 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3091 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3093 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3095 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3097 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3099 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3101 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3103
3104 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3106 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3108 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3110 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3112 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3114 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3116 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3118 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3120 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3122 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3124 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3126 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3128
3129 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3131 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3133 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3135 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3137 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3139 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3141 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3143 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3145 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3147 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3149 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3151 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3153
3154 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3156 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3158 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3160 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3162 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3164 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3166 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3168 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3170 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3172 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3174 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3176 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3178
3179 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3181 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3183 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3185 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3187 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3189 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3191 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3193 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3195 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3197 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3199 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3201 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3203
3204 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3206 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3208 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3210 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3212 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3214 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3216 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3218 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3220 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3222 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3224 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3226 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3228
3229 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3231 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3233 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3235 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3237 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3239 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3241
3242 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3244 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3246 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3248 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3250 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3252 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3254
3255 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3257 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3259 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3261 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3263 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3265 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3267 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3269 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3271 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3273 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3275 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3277 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3279 }
3280}
3281
3282static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3283 switch (Intrinsic) {
3284 default:
3285 return 0;
3286 case Intrinsic::nvvm_suld_1d_i8_clamp:
3288 case Intrinsic::nvvm_suld_1d_i16_clamp:
3290 case Intrinsic::nvvm_suld_1d_i32_clamp:
3292 case Intrinsic::nvvm_suld_1d_i64_clamp:
3294 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3296 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3298 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3300 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3302 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3304 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3306 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3308 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3310 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3312 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3314 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3316 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3318 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3320 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3322 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3324 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3326 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3328 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3330 case Intrinsic::nvvm_suld_2d_i8_clamp:
3332 case Intrinsic::nvvm_suld_2d_i16_clamp:
3334 case Intrinsic::nvvm_suld_2d_i32_clamp:
3336 case Intrinsic::nvvm_suld_2d_i64_clamp:
3338 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3340 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3342 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3344 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3346 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3348 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3350 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3352 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3354 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3356 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3358 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3360 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3362 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3364 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3366 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3368 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3370 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3372 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3374 case Intrinsic::nvvm_suld_3d_i8_clamp:
3376 case Intrinsic::nvvm_suld_3d_i16_clamp:
3378 case Intrinsic::nvvm_suld_3d_i32_clamp:
3380 case Intrinsic::nvvm_suld_3d_i64_clamp:
3382 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3384 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3386 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3388 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3390 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3392 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3394 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3396 case Intrinsic::nvvm_suld_1d_i8_trap:
3398 case Intrinsic::nvvm_suld_1d_i16_trap:
3400 case Intrinsic::nvvm_suld_1d_i32_trap:
3402 case Intrinsic::nvvm_suld_1d_i64_trap:
3404 case Intrinsic::nvvm_suld_1d_v2i8_trap:
3406 case Intrinsic::nvvm_suld_1d_v2i16_trap:
3408 case Intrinsic::nvvm_suld_1d_v2i32_trap:
3410 case Intrinsic::nvvm_suld_1d_v2i64_trap:
3412 case Intrinsic::nvvm_suld_1d_v4i8_trap:
3414 case Intrinsic::nvvm_suld_1d_v4i16_trap:
3416 case Intrinsic::nvvm_suld_1d_v4i32_trap:
3418 case Intrinsic::nvvm_suld_1d_array_i8_trap:
3420 case Intrinsic::nvvm_suld_1d_array_i16_trap:
3422 case Intrinsic::nvvm_suld_1d_array_i32_trap:
3424 case Intrinsic::nvvm_suld_1d_array_i64_trap:
3426 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3428 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3430 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3432 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3434 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3436 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3438 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3440 case Intrinsic::nvvm_suld_2d_i8_trap:
3442 case Intrinsic::nvvm_suld_2d_i16_trap:
3444 case Intrinsic::nvvm_suld_2d_i32_trap:
3446 case Intrinsic::nvvm_suld_2d_i64_trap:
3448 case Intrinsic::nvvm_suld_2d_v2i8_trap:
3450 case Intrinsic::nvvm_suld_2d_v2i16_trap:
3452 case Intrinsic::nvvm_suld_2d_v2i32_trap:
3454 case Intrinsic::nvvm_suld_2d_v2i64_trap:
3456 case Intrinsic::nvvm_suld_2d_v4i8_trap:
3458 case Intrinsic::nvvm_suld_2d_v4i16_trap:
3460 case Intrinsic::nvvm_suld_2d_v4i32_trap:
3462 case Intrinsic::nvvm_suld_2d_array_i8_trap:
3464 case Intrinsic::nvvm_suld_2d_array_i16_trap:
3466 case Intrinsic::nvvm_suld_2d_array_i32_trap:
3468 case Intrinsic::nvvm_suld_2d_array_i64_trap:
3470 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3472 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3474 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3476 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3478 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3480 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3482 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3484 case Intrinsic::nvvm_suld_3d_i8_trap:
3486 case Intrinsic::nvvm_suld_3d_i16_trap:
3488 case Intrinsic::nvvm_suld_3d_i32_trap:
3490 case Intrinsic::nvvm_suld_3d_i64_trap:
3492 case Intrinsic::nvvm_suld_3d_v2i8_trap:
3494 case Intrinsic::nvvm_suld_3d_v2i16_trap:
3496 case Intrinsic::nvvm_suld_3d_v2i32_trap:
3498 case Intrinsic::nvvm_suld_3d_v2i64_trap:
3500 case Intrinsic::nvvm_suld_3d_v4i8_trap:
3502 case Intrinsic::nvvm_suld_3d_v4i16_trap:
3504 case Intrinsic::nvvm_suld_3d_v4i32_trap:
3506 case Intrinsic::nvvm_suld_1d_i8_zero:
3508 case Intrinsic::nvvm_suld_1d_i16_zero:
3510 case Intrinsic::nvvm_suld_1d_i32_zero:
3512 case Intrinsic::nvvm_suld_1d_i64_zero:
3514 case Intrinsic::nvvm_suld_1d_v2i8_zero:
3516 case Intrinsic::nvvm_suld_1d_v2i16_zero:
3518 case Intrinsic::nvvm_suld_1d_v2i32_zero:
3520 case Intrinsic::nvvm_suld_1d_v2i64_zero:
3522 case Intrinsic::nvvm_suld_1d_v4i8_zero:
3524 case Intrinsic::nvvm_suld_1d_v4i16_zero:
3526 case Intrinsic::nvvm_suld_1d_v4i32_zero:
3528 case Intrinsic::nvvm_suld_1d_array_i8_zero:
3530 case Intrinsic::nvvm_suld_1d_array_i16_zero:
3532 case Intrinsic::nvvm_suld_1d_array_i32_zero:
3534 case Intrinsic::nvvm_suld_1d_array_i64_zero:
3536 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3538 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3540 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3542 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3544 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3546 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3548 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3550 case Intrinsic::nvvm_suld_2d_i8_zero:
3552 case Intrinsic::nvvm_suld_2d_i16_zero:
3554 case Intrinsic::nvvm_suld_2d_i32_zero:
3556 case Intrinsic::nvvm_suld_2d_i64_zero:
3558 case Intrinsic::nvvm_suld_2d_v2i8_zero:
3560 case Intrinsic::nvvm_suld_2d_v2i16_zero:
3562 case Intrinsic::nvvm_suld_2d_v2i32_zero:
3564 case Intrinsic::nvvm_suld_2d_v2i64_zero:
3566 case Intrinsic::nvvm_suld_2d_v4i8_zero:
3568 case Intrinsic::nvvm_suld_2d_v4i16_zero:
3570 case Intrinsic::nvvm_suld_2d_v4i32_zero:
3572 case Intrinsic::nvvm_suld_2d_array_i8_zero:
3574 case Intrinsic::nvvm_suld_2d_array_i16_zero:
3576 case Intrinsic::nvvm_suld_2d_array_i32_zero:
3578 case Intrinsic::nvvm_suld_2d_array_i64_zero:
3580 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3582 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3584 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3586 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3588 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3590 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3592 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3594 case Intrinsic::nvvm_suld_3d_i8_zero:
3596 case Intrinsic::nvvm_suld_3d_i16_zero:
3598 case Intrinsic::nvvm_suld_3d_i32_zero:
3600 case Intrinsic::nvvm_suld_3d_i64_zero:
3602 case Intrinsic::nvvm_suld_3d_v2i8_zero:
3604 case Intrinsic::nvvm_suld_3d_v2i16_zero:
3606 case Intrinsic::nvvm_suld_3d_v2i32_zero:
3608 case Intrinsic::nvvm_suld_3d_v2i64_zero:
3610 case Intrinsic::nvvm_suld_3d_v4i8_zero:
3612 case Intrinsic::nvvm_suld_3d_v4i16_zero:
3614 case Intrinsic::nvvm_suld_3d_v4i32_zero:
3616 }
3617}
3618
3619 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3620 // TgtMemIntrinsic because we need information that is only available
3621 // from the IR "Value" of the destination pointer, in particular its
3622 // address space.
3623
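// An illustrative sketch of how a typical load-like case in the switch below
// describes its access; the specific memVT, flags, and alignment shown here
// are just one example and vary per intrinsic:
//
//   Info.opc = ISD::INTRINSIC_W_CHAIN;      // chained intrinsic node
//   Info.memVT = MVT::v8f16;                // type actually accessed in memory
//   Info.ptrVal = I.getArgOperand(0);       // keep the pointer Value so its
//                                           // address space remains visible
//   Info.offset = 0;
//   Info.flags = MachineMemOperand::MOLoad; // read-only access
//   Info.align = Align(16);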
3624 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3625 IntrinsicInfo &Info, const CallInst &I,
3626 MachineFunction &MF, unsigned Intrinsic) const {
3627 switch (Intrinsic) {
3628 default:
3629 return false;
3630 case Intrinsic::nvvm_match_all_sync_i32p:
3631 case Intrinsic::nvvm_match_all_sync_i64p:
3632 Info.opc = ISD::INTRINSIC_W_CHAIN;
3633 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3634 // in order to model data exchange with other threads, but perform no real
3635 // memory accesses.
3636 Info.memVT = MVT::i1;
3637
3638 // Our result depends on both our and other thread's arguments.
3639 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3640 return true;
3641 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3642 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3643 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3644 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3645 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3646 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3647 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3648 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3649 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3650 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3651 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3652 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3653 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3654 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3655 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3656 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3657 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3658 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3659 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3660 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3661 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3662 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3663 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3664 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3665 Info.opc = ISD::INTRINSIC_W_CHAIN;
3666 Info.memVT = MVT::v8f16;
3667 Info.ptrVal = I.getArgOperand(0);
3668 Info.offset = 0;
3669 Info.flags = MachineMemOperand::MOLoad;
3670 Info.align = Align(16);
3671 return true;
3672 }
3673 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3674 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3675 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3676 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3677 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3678 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3679 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3680 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3681 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3682 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3683 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3684 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3685 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3686 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3687 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3688 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3689 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3690 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3691 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3692 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3693 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3694 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3695 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3696 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3697 Info.opc = ISD::INTRINSIC_W_CHAIN;
3698 Info.memVT = MVT::v2i32;
3699 Info.ptrVal = I.getArgOperand(0);
3700 Info.offset = 0;
3701 Info.flags = MachineMemOperand::MOLoad;
3702 Info.align = Align(8);
3703 return true;
3704 }
3705
3706 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3707 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3708 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3709 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3710 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3711 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3712 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3713 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3714 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3715 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3716 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3717 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3718 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3719 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3720 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3721 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3722
3723 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3724 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3725 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3726 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3727 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3728 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3729 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3730 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3731 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3732 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3733 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3734 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3735 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3736 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3737 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3738 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3739 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3740 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3741 Info.opc = ISD::INTRINSIC_W_CHAIN;
3742 Info.memVT = MVT::v4i32;
3743 Info.ptrVal = I.getArgOperand(0);
3744 Info.offset = 0;
3745 Info.flags = MachineMemOperand::MOLoad;
3746 Info.align = Align(16);
3747 return true;
3748 }
3749
3750 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3751 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3752 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3753 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3754 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3755 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3756 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3757 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3758
3759 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3760 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3761 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3762 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3763 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3764 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3765 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3766 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3767 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3768 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3769 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3770 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3771 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3772 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3773 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3774 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3775 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3776 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3777 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3778 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3779 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3780 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3781 Info.opc = ISD::INTRINSIC_W_CHAIN;
3782 Info.memVT = MVT::i32;
3783 Info.ptrVal = I.getArgOperand(0);
3784 Info.offset = 0;
3785 Info.flags = MachineMemOperand::MOLoad;
3786 Info.align = Align(4);
3787 return true;
3788 }
3789
3790 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3791 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3792 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3793 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3794 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3795 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3796 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3797 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3798 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3799 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3800 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3801 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3802 Info.opc = ISD::INTRINSIC_W_CHAIN;
3803 Info.memVT = MVT::v4f16;
3804 Info.ptrVal = I.getArgOperand(0);
3805 Info.offset = 0;
3806 Info.flags = MachineMemOperand::MOLoad;
3807 Info.align = Align(16);
3808 return true;
3809 }
3810
3811 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3812 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3813 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3814 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3815 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3816 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3817 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3818 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3819 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3820 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3821 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3822 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3823 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3824 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3825 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3826 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3827 Info.opc = ISD::INTRINSIC_W_CHAIN;
3828 Info.memVT = MVT::v8f32;
3829 Info.ptrVal = I.getArgOperand(0);
3830 Info.offset = 0;
3831 Info.flags = MachineMemOperand::MOLoad;
3832 Info.align = Align(16);
3833 return true;
3834 }
3835
3836 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3837 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3838 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3839 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3840
3841 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3842 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3843 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3844 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3845
3846 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3847 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3848 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3849 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3850 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3851 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3852 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3853 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3854 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3855 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3856 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3857 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3858 Info.opc = ISD::INTRINSIC_W_CHAIN;
3859 Info.memVT = MVT::v8i32;
3860 Info.ptrVal = I.getArgOperand(0);
3861 Info.offset = 0;
3862 Info.flags = MachineMemOperand::MOLoad;
3863 Info.align = Align(16);
3864 return true;
3865 }
3866
3867 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3868 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3869 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3870 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3871 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3872 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3873 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3874 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3875 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3876 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
3877 Info.opc = ISD::INTRINSIC_W_CHAIN;
3878 Info.memVT = MVT::v2i32;
3879 Info.ptrVal = I.getArgOperand(0);
3880 Info.offset = 0;
3881 Info.flags = MachineMemOperand::MOLoad;
3882 Info.align = Align(8);
3883 return true;
3884 }
3885
3886 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3887 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3888 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3889 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3890
3891 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3892 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3893 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3894 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
3895 Info.opc = ISD::INTRINSIC_W_CHAIN;
3896 Info.memVT = MVT::f64;
3897 Info.ptrVal = I.getArgOperand(0);
3898 Info.offset = 0;
3899 Info.flags = MachineMemOperand::MOLoad;
3900 Info.align = Align(8);
3901 return true;
3902 }
3903
3904 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3905 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3906 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3907 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
3908 Info.opc = ISD::INTRINSIC_W_CHAIN;
3909 Info.memVT = MVT::v2f64;
3910 Info.ptrVal = I.getArgOperand(0);
3911 Info.offset = 0;
3912 Info.flags = MachineMemOperand::MOLoad;
3913 Info.align = Align(16);
3914 return true;
3915 }
3916
3917 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3918 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3919 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3920 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3921 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3922 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3923 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3924 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3925 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3926 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3927 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3928 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3929 Info.opc = ISD::INTRINSIC_W_CHAIN;
3930 Info.memVT = MVT::v4f16;
3931 Info.ptrVal = I.getArgOperand(0);
3932 Info.offset = 0;
3933 Info.flags = MachineMemOperand::MOStore;
3934 Info.align = Align(16);
3935 return true;
3936 }
3937
3938 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3939 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3940 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3941 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3942 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3943 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3944 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3945 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3946 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3947 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3948 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3949 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3950 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3951 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3952 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3953 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
3954 Info.opc = ISD::INTRINSIC_W_CHAIN;
3955 Info.memVT = MVT::v8f32;
3956 Info.ptrVal = I.getArgOperand(0);
3957 Info.offset = 0;
3958 Info.flags = MachineMemOperand::MOStore;
3959 Info.align = Align(16);
3960 return true;
3961 }
3962
3963 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3964 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3965 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3966 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3967 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3968 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3969 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3970 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3971 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3972 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3973 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3974 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3975 Info.opc = ISD::INTRINSIC_W_CHAIN;
3976 Info.memVT = MVT::v8i32;
3977 Info.ptrVal = I.getArgOperand(0);
3978 Info.offset = 0;
3979 Info.flags = MachineMemOperand::MOStore;
3980 Info.align = Align(16);
3981 return true;
3982 }
3983
3984 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3985 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3986 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3987 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3988 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3989 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3990 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3991 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3992 Info.opc = ISD::INTRINSIC_W_CHAIN;
3993 Info.memVT = MVT::v2i32;
3994 Info.ptrVal = I.getArgOperand(0);
3995 Info.offset = 0;
3996 Info.flags = MachineMemOperand::MOStore;
3997 Info.align = Align(8);
3998 return true;
3999 }
4000
4001 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4002 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4003 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4004 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4005 Info.opc = ISD::INTRINSIC_W_CHAIN;
4006 Info.memVT = MVT::v2f64;
4007 Info.ptrVal = I.getArgOperand(0);
4008 Info.offset = 0;
4009 Info.flags = MachineMemOperand::MOStore;
4010 Info.align = Align(16);
4011 return true;
4012 }
4013
4014 case Intrinsic::nvvm_atomic_load_inc_32:
4015 case Intrinsic::nvvm_atomic_load_dec_32:
4016
4017 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4018 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4019 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4020 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4021 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4022 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4023 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4024 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4025 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4026 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4027 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4028 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4029 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4030 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4031 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4032 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4033 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4034 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4035 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4036 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4037 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4038 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4039 auto &DL = I.getModule()->getDataLayout();
4040 Info.opc = ISD::INTRINSIC_W_CHAIN;
4041 Info.memVT = getValueType(DL, I.getType());
4042 Info.ptrVal = I.getArgOperand(0);
4043 Info.offset = 0;
4044 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4045 Info.align.reset();
4046 return true;
4047 }
4048
4049 case Intrinsic::nvvm_ldu_global_i:
4050 case Intrinsic::nvvm_ldu_global_f:
4051 case Intrinsic::nvvm_ldu_global_p: {
4052 auto &DL = I.getModule()->getDataLayout();
4053 Info.opc = ISD::INTRINSIC_W_CHAIN;
4054 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4055 Info.memVT = getValueType(DL, I.getType());
4056 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
4057 Info.memVT = getPointerTy(DL);
4058 else
4059 Info.memVT = getValueType(DL, I.getType());
4060 Info.ptrVal = I.getArgOperand(0);
4061 Info.offset = 0;
4062 Info.flags = MachineMemOperand::MOLoad;
4063 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4064
4065 return true;
4066 }
4067 case Intrinsic::nvvm_ldg_global_i:
4068 case Intrinsic::nvvm_ldg_global_f:
4069 case Intrinsic::nvvm_ldg_global_p: {
4070 auto &DL = I.getModule()->getDataLayout();
4071
4072 Info.opc = ISD::INTRINSIC_W_CHAIN;
4073 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4074 Info.memVT = getValueType(DL, I.getType());
4075 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
4076 Info.memVT = getPointerTy(DL);
4077 else
4078 Info.memVT = getValueType(DL, I.getType());
4079 Info.ptrVal = I.getArgOperand(0);
4080 Info.offset = 0;
4081 Info.flags = MachineMemOperand::MOLoad;
4082 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4083
4084 return true;
4085 }
4086
4087 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4088 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4089 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4090 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4091 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4092 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4093 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4094 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4095 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4096 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4097 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4098 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4099 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4100 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4101 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4102 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4103 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4104 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4105 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4106 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4107 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4108 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4109 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4110 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4111 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4112 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4113 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4114 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4115 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4116 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4117 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4118 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4119 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4120 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4121 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4122 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4123 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4124 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4125 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4126 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4127 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4128 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4129 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4130 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4131 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4132 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4133 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4134 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4135 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4136 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4137 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4138 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4139 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4140 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4141 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4142 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4143 Info.opc = getOpcForTextureInstr(Intrinsic);
4144 Info.memVT = MVT::v4f32;
4145 Info.ptrVal = nullptr;
4146 Info.offset = 0;
4147 Info.flags = MachineMemOperand::MOLoad;
4148 Info.align = Align(16);
4149 return true;
4150
4151 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4152 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4153 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4154 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4155 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4156 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4157 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4158 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4159 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4160 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4161 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4162 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4163 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4164 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4165 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4166 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4167 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4168 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4169 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4170 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4171 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4172 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4173 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4174 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4175 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4176 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4177 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4178 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4179 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4180 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4181 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4182 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4183 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4184 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4185 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4186 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4187 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4188 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4189 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4190 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4191 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4192 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4193 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4194 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4195 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4196 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4197 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4198 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4199 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4200 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4201 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4202 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4203 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4204 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4205 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4206 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4207 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4208 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4209 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4210 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4211 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4212 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4213 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4214 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: