1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "NVPTXISelLowering.h"
16 #include "MCTargetDesc/NVPTXBaseInfo.h"
17 #include "NVPTX.h"
18 #include "NVPTXSubtarget.h"
19 #include "NVPTXTargetMachine.h"
20 #include "NVPTXTargetObjectFile.h"
21 #include "NVPTXUtilities.h"
22 #include "llvm/ADT/APInt.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineMemOperand.h"
28 #include "llvm/CodeGen/SelectionDAG.h"
29 #include "llvm/CodeGen/SelectionDAGNodes.h"
30 #include "llvm/CodeGen/TargetCallingConv.h"
31 #include "llvm/CodeGen/TargetLowering.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/Argument.h"
34 #include "llvm/IR/Attributes.h"
35 #include "llvm/IR/CallSite.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/DataLayout.h"
38 #include "llvm/IR/DerivedTypes.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/IR/GlobalValue.h"
41 #include "llvm/IR/Instruction.h"
42 #include "llvm/IR/Instructions.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/Value.h"
46 #include "llvm/Support/Casting.h"
47 #include "llvm/Support/CodeGen.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Support/MachineValueType.h"
51 #include "llvm/Support/MathExtras.h"
52 #include "llvm/Support/raw_ostream.h"
53 #include "llvm/Target/TargetMachine.h"
54 #include "llvm/Target/TargetOptions.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <iterator>
59 #include <sstream>
60 #include <string>
61 #include <utility>
62 #include <vector>
63 
64 #define DEBUG_TYPE "nvptx-lower"
65 
66 using namespace llvm;
67 
68 static unsigned int uniqueCallSite = 0;
69 
70 static cl::opt<bool> sched4reg(
71     "nvptx-sched4reg",
72     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
73 
74 static cl::opt<unsigned> FMAContractLevelOpt(
75     "nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
76     cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
77              " 1: do it, 2: do it aggressively)"),
78     cl::init(2));
79 
80 static cl::opt<int> UsePrecDivF32(
81     "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
82     cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
83              " IEEE-compliant F32 div.rnd if available."),
84     cl::init(2));
85 
86 static cl::opt<bool> UsePrecSqrtF32(
87     "nvptx-prec-sqrtf32", cl::Hidden,
88     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
89     cl::init(true));
90 
91 static cl::opt<bool> FtzEnabled(
92     "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
93     cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
94     cl::init(false));
95 
96 int NVPTXTargetLowering::getDivF32Level() const {
97   if (UsePrecDivF32.getNumOccurrences() > 0) {
98     // If nvptx-prec-divf32=N is used on the command-line, always honor it
99     return UsePrecDivF32;
100   } else {
101     // Otherwise, use div.approx if fast math is enabled
102     if (getTargetMachine().Options.UnsafeFPMath)
103       return 0;
104     else
105       return 2;
106   }
107 }
108 
109 bool NVPTXTargetLowering::usePrecSqrtF32() const {
110   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
111     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
112     return UsePrecSqrtF32;
113   } else {
114     // Otherwise, use sqrt.approx if fast math is enabled
115     return !getTargetMachine().Options.UnsafeFPMath;
116   }
117 }
118 
119 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
120   // TODO: Get rid of this flag; there can be only one way to do this.
121  if (FtzEnabled.getNumOccurrences() > 0) {
122  // If nvptx-f32ftz is used on the command-line, always honor it
123  return FtzEnabled;
124  } else {
125  const Function &F = MF.getFunction();
126  // Otherwise, check for an nvptx-f32ftz attribute on the function
127  if (F.hasFnAttribute("nvptx-f32ftz"))
128  return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
129  else
130  return false;
131  }
132 }
133 
134 static bool IsPTXVectorType(MVT VT) {
135  switch (VT.SimpleTy) {
136  default:
137  return false;
138  case MVT::v2i1:
139  case MVT::v4i1:
140  case MVT::v2i8:
141  case MVT::v4i8:
142  case MVT::v2i16:
143  case MVT::v4i16:
144  case MVT::v2i32:
145  case MVT::v4i32:
146  case MVT::v2i64:
147  case MVT::v2f16:
148  case MVT::v4f16:
149  case MVT::v8f16: // <4 x f16x2>
150  case MVT::v2f32:
151  case MVT::v4f32:
152  case MVT::v2f64:
153  return true;
154  }
155 }
156 
157 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
158 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
159 /// into their primitive components.
160 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
161 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
162 /// LowerCall, and LowerReturn.
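/// For illustration (example not taken from the original source): assuming the
/// default NVPTX DataLayout, a type such as { i32, <4 x float> } would be
/// flattened to
///   ValueVTs = { i32, f32, f32, f32, f32 }
///   Offsets  = {   0,  16,  20,  24,  28 }
/// i.e. vectors contribute one entry per element, advancing by the element's
/// store size (even-length f16 vectors being the v2f16 exception noted below).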
163 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
164                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
165                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
166                                uint64_t StartingOffset = 0) {
167  SmallVector<EVT, 16> TempVTs;
168  SmallVector<uint64_t, 16> TempOffsets;
169 
170  // Special case for i128 - decompose to (i64, i64)
171  if (Ty->isIntegerTy(128)) {
172  ValueVTs.push_back(EVT(MVT::i64));
173  ValueVTs.push_back(EVT(MVT::i64));
174 
175  if (Offsets) {
176  Offsets->push_back(StartingOffset + 0);
177  Offsets->push_back(StartingOffset + 8);
178  }
179 
180  return;
181  }
182 
183  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
184  if (StructType *STy = dyn_cast<StructType>(Ty)) {
185  auto const *SL = DL.getStructLayout(STy);
186  auto ElementNum = 0;
187  for(auto *EI : STy->elements()) {
188  ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
189  StartingOffset + SL->getElementOffset(ElementNum));
190  ++ElementNum;
191  }
192  return;
193  }
194 
195  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
196  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
197  EVT VT = TempVTs[i];
198  uint64_t Off = TempOffsets[i];
199  // Split vectors into individual elements, except for v2f16, which
200  // we will pass as a single scalar.
201  if (VT.isVector()) {
202  unsigned NumElts = VT.getVectorNumElements();
203  EVT EltVT = VT.getVectorElementType();
204  // Vectors with an even number of f16 elements will be passed to
205  // us as an array of v2f16 elements. We must match this so we
206  // stay in sync with Ins/Outs.
207  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
208  EltVT = MVT::v2f16;
209  NumElts /= 2;
210  }
211  for (unsigned j = 0; j != NumElts; ++j) {
212  ValueVTs.push_back(EltVT);
213  if (Offsets)
214  Offsets->push_back(Off + j * EltVT.getStoreSize());
215  }
216  } else {
217  ValueVTs.push_back(VT);
218  if (Offsets)
219  Offsets->push_back(Off);
220  }
221  }
222 }
223 
224 // Check whether we can merge loads/stores of some of the pieces of a
225 // flattened function parameter or return value into a single vector
226 // load/store.
227 //
228 // The flattened parameter is represented as a list of EVTs and
229 // offsets, and the whole structure is aligned to ParamAlignment. This
230 // function determines whether we can load/store pieces of the
231 // parameter starting at index Idx using a single vectorized op of
232 // size AccessSize. If so, it returns the number of param pieces
233 // covered by the vector op. Otherwise, it returns 1.
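//
// Illustrative example (not part of the original source): with
//   ValueVTs = { f32, f32, f32, f32 }, Offsets = { 0, 4, 8, 12 },
//   ParamAlignment = 16, Idx = 0 and AccessSize = 16,
// all checks below pass and the function returns 4, meaning the four pieces
// can be covered by a single 16-byte (v4.f32) access.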
234 static unsigned CanMergeParamLoadStoresStartingAt(
235     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
236  const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
237  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
238 
239  // Can't vectorize if param alignment is not sufficient.
240  if (AccessSize > ParamAlignment)
241  return 1;
242  // Can't vectorize if offset is not aligned.
243  if (Offsets[Idx] & (AccessSize - 1))
244  return 1;
245 
246  EVT EltVT = ValueVTs[Idx];
247  unsigned EltSize = EltVT.getStoreSize();
248 
249  // Element is too large to vectorize.
250  if (EltSize >= AccessSize)
251  return 1;
252 
253  unsigned NumElts = AccessSize / EltSize;
254   // Can't vectorize if AccessSize is not a multiple of EltSize.
255  if (AccessSize != EltSize * NumElts)
256  return 1;
257 
258  // We don't have enough elements to vectorize.
259  if (Idx + NumElts > ValueVTs.size())
260  return 1;
261 
262  // PTX ISA can only deal with 2- and 4-element vector ops.
263  if (NumElts != 4 && NumElts != 2)
264  return 1;
265 
266  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
267  // Types do not match.
268  if (ValueVTs[j] != EltVT)
269  return 1;
270 
271  // Elements are not contiguous.
272  if (Offsets[j] - Offsets[j - 1] != EltSize)
273  return 1;
274  }
275   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
276  return NumElts;
277 }
278 
279 // Flags for tracking per-element vectorization state of loads/stores
280 // of a flattened function parameter or return value.
281 enum ParamVectorizationFlags {
282   PVF_INNER = 0x0,  // Middle elements of a vector.
283  PVF_FIRST = 0x1, // First element of the vector.
284  PVF_LAST = 0x2, // Last element of the vector.
285  // Scalar is effectively a 1-element vector.
286   PVF_SCALAR = PVF_FIRST | PVF_LAST
287 };
288 
289 // Computes whether and how we can vectorize the loads/stores of a
290 // flattened function parameter or return value.
291 //
292 // The flattened parameter is represented as the list of ValueVTs and
293 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
294 // of the same size as ValueVTs indicating how each piece should be
295 // loaded/stored (i.e. as a scalar, or as part of a vector
296 // load/store).
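//
// Sketch of the expected output (illustrative, not from the original source):
// for four contiguous f32 pieces at offsets 0/4/8/12 with 16-byte alignment
// the result is { PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST } (one v4 op),
// while any piece that cannot be merged keeps its default PVF_SCALAR flag.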
297 static SmallVector<ParamVectorizationFlags, 16>
298 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
299                      const SmallVectorImpl<uint64_t> &Offsets,
300                      unsigned ParamAlignment) {
301  // Set vector size to match ValueVTs and mark all elements as
302  // scalars by default.
303   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
304   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
305 
306  // Check what we can vectorize using 128/64/32-bit accesses.
307  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
308  // Skip elements we've already processed.
309  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
310  for (unsigned AccessSize : {16, 8, 4, 2}) {
311  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
312  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
313  // Mark vectorized elements.
314  switch (NumElts) {
315  default:
316  llvm_unreachable("Unexpected return value");
317  case 1:
318  // Can't vectorize using this size, try next smaller size.
319  continue;
320  case 2:
321  assert(I + 1 < E && "Not enough elements.");
322  VectorInfo[I] = PVF_FIRST;
323  VectorInfo[I + 1] = PVF_LAST;
324  I += 1;
325  break;
326  case 4:
327  assert(I + 3 < E && "Not enough elements.");
328  VectorInfo[I] = PVF_FIRST;
329  VectorInfo[I + 1] = PVF_INNER;
330  VectorInfo[I + 2] = PVF_INNER;
331  VectorInfo[I + 3] = PVF_LAST;
332  I += 3;
333  break;
334  }
335  // Break out of the inner loop because we've already succeeded
336       // using the largest possible AccessSize.
337  break;
338  }
339  }
340  return VectorInfo;
341 }
342 
343 // NVPTXTargetLowering Constructor.
344 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
345                                          const NVPTXSubtarget &STI)
346  : TargetLowering(TM), nvTM(&TM), STI(STI) {
347   // Always lower memset, memcpy, and memmove intrinsics to load/store
348   // instructions, rather than generating calls to memset, memcpy, or
349   // memmove.
350  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
351  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
352  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
353 
354   setBooleanContents(ZeroOrNegativeOneBooleanContent);
355   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
356 
357  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
358  // condition branches.
359  setJumpIsExpensive(true);
360 
361  // Wide divides are _very_ slow. Try to reduce the width of the divide if
362  // possible.
363  addBypassSlowDiv(64, 32);
364 
365  // By default, use the Source scheduling
366   if (sched4reg)
367     setSchedulingPreference(Sched::RegPressure);
368   else
369     setSchedulingPreference(Sched::Source);
370 
371  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
372  LegalizeAction NoF16Action) {
373  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
374  };
375 
376  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
377  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
378  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
379  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
380  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
381  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
382  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
383  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
384 
385  // Conversion to/from FP16/FP16x2 is always legal.
392 
393  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
394  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
395 
396  // Operations not directly supported by NVPTX.
397   for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
398                  MVT::i16, MVT::i32, MVT::i64}) {
399     setOperationAction(ISD::SELECT_CC, VT, Expand);
400     setOperationAction(ISD::BR_CC, VT, Expand);
401   }
402 
403  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
404  // For others we will expand to a SHL/SRA pair.
405   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
406   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
407   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
408   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
409   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
410 
411   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
412   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
413   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
414   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
415   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
416   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
417 
418   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
419   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
420 
421  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
422  // that don't have h/w rotation we lower them to multi-instruction assembly.
423  // See ROT*_sw in NVPTXIntrInfo.td
424   setOperationAction(ISD::ROTL, MVT::i64, Legal);
425   setOperationAction(ISD::ROTR, MVT::i64, Legal);
426   setOperationAction(ISD::ROTL, MVT::i32, Legal);
427   setOperationAction(ISD::ROTR, MVT::i32, Legal);
428 
429   setOperationAction(ISD::ROTL, MVT::i16, Expand);
430   setOperationAction(ISD::ROTR, MVT::i16, Expand);
431   setOperationAction(ISD::ROTL, MVT::i8, Expand);
432   setOperationAction(ISD::ROTR, MVT::i8, Expand);
433   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
434   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
435   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
436 
437  // Indirect branch is not supported.
438  // This also disables Jump Table creation.
439   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
440   setOperationAction(ISD::BRIND, MVT::Other, Expand);
441 
442   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
443   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
444 
445   // We want to legalize constant related memmove and memcpy
446   // intrinsics.
447   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
448 
449  // Turn FP extload into load/fpextend
459  // Turn FP truncstore into trunc + store.
460  // FIXME: vector types should also be expanded
461   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
462   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
463   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
464 
465  // PTX does not support load / store predicate registers
466   setOperationAction(ISD::LOAD, MVT::i1, Custom);
467   setOperationAction(ISD::STORE, MVT::i1, Custom);
468 
469  for (MVT VT : MVT::integer_valuetypes()) {
470     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
471     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
472     setTruncStoreAction(VT, MVT::i1, Expand);
473   }
474 
475  // This is legal in NVPTX
476   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
477   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
478   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
479 
480  // TRAP can be lowered to PTX trap
481   setOperationAction(ISD::TRAP, MVT::Other, Legal);
482 
483  // Register custom handling for vector loads/stores
484  for (MVT VT : MVT::vector_valuetypes()) {
485  if (IsPTXVectorType(VT)) {
486       setOperationAction(ISD::LOAD, VT, Custom);
487       setOperationAction(ISD::STORE, VT, Custom);
488       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
489     }
490  }
491 
492  // Custom handling for i8 intrinsics
493   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
494 
495  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
496     setOperationAction(ISD::ABS,  Ty, Legal);
497     setOperationAction(ISD::SMIN, Ty, Legal);
498     setOperationAction(ISD::SMAX, Ty, Legal);
499     setOperationAction(ISD::UMIN, Ty, Legal);
500     setOperationAction(ISD::UMAX, Ty, Legal);
501 
502     setOperationAction(ISD::CTPOP, Ty, Legal);
503     setOperationAction(ISD::CTLZ, Ty, Legal);
504   }
505 
506   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
507   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
508   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
509 
510  // PTX does not directly support SELP of i1, so promote to i32 first
511   setOperationAction(ISD::SELECT, MVT::i1, Custom);
512 
513  // PTX cannot multiply two i64s in a single instruction.
514   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
515   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
516 
517  // We have some custom DAG combine patterns for these nodes
518   setTargetDAGCombine(ISD::ADD);
519   setTargetDAGCombine(ISD::AND);
520   setTargetDAGCombine(ISD::FADD);
521   setTargetDAGCombine(ISD::MUL);
522   setTargetDAGCombine(ISD::SHL);
523   setTargetDAGCombine(ISD::SREM);
524   setTargetDAGCombine(ISD::UREM);
525 
526  // setcc for f16x2 needs special handling to prevent legalizer's
527  // attempt to scalarize it due to v2i1 not being legal.
528  if (STI.allowFP16Math())
529     setTargetDAGCombine(ISD::SETCC);
530 
531  // Promote fp16 arithmetic if fp16 hardware isn't available or the
532  // user passed --nvptx-no-fp16-math. The flag is useful because,
533  // although sm_53+ GPUs have some sort of FP16 support in
534  // hardware, only sm_53 and sm_60 have full implementation. Others
535  // only have token amount of hardware and are likely to run faster
536  // by using fp32 units instead.
537  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
538  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
539  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
540  }
541 
542  // There's no neg.f16 instruction. Expand to (0-x).
543   setOperationAction(ISD::FNEG, MVT::f16, Expand);
544   setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
545 
546  // (would be) Library functions.
547 
548  // These map to conversion instructions for scalar FP types.
549  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
550                          ISD::FROUND, ISD::FTRUNC}) {
551     setOperationAction(Op, MVT::f16, Legal);
552     setOperationAction(Op, MVT::f32, Legal);
553     setOperationAction(Op, MVT::f64, Legal);
554     setOperationAction(Op, MVT::v2f16, Expand);
555   }
556 
557  // 'Expand' implements FCOPYSIGN without calling an external library.
558   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
559   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
560   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
561   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
562 
563  // These map to corresponding instructions for f32/f64. f16 must be
564  // promoted to f32. v2f16 is expanded to f16, which is then promoted
565  // to f32.
566  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
567                          ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
568     setOperationAction(Op, MVT::f16, Promote);
569     setOperationAction(Op, MVT::f32, Legal);
570     setOperationAction(Op, MVT::f64, Legal);
571     setOperationAction(Op, MVT::v2f16, Expand);
572   }
577 
578  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
579  // No FPOW or FREM in PTX.
580 
581  // Now deduce the information based on the above mentioned
582  // actions
583   computeRegisterProperties(STI.getRegisterInfo());
584 }
585 
586 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
587  switch ((NVPTXISD::NodeType)Opcode) {
588   case NVPTXISD::FIRST_NUMBER:
589     break;
590   case NVPTXISD::CALL:
591     return "NVPTXISD::CALL";
592   case NVPTXISD::RET_FLAG:
593     return "NVPTXISD::RET_FLAG";
594   case NVPTXISD::LOAD_PARAM:
595     return "NVPTXISD::LOAD_PARAM";
596   case NVPTXISD::Wrapper:
597     return "NVPTXISD::Wrapper";
598   case NVPTXISD::DeclareParam:
599     return "NVPTXISD::DeclareParam";
600   case NVPTXISD::DeclareScalarParam:
601     return "NVPTXISD::DeclareScalarParam";
602   case NVPTXISD::DeclareRet:
603     return "NVPTXISD::DeclareRet";
604   case NVPTXISD::DeclareScalarRet:
605     return "NVPTXISD::DeclareScalarRet";
606   case NVPTXISD::DeclareRetParam:
607     return "NVPTXISD::DeclareRetParam";
608   case NVPTXISD::PrintCall:
609     return "NVPTXISD::PrintCall";
610   case NVPTXISD::PrintConvergentCall:
611     return "NVPTXISD::PrintConvergentCall";
612   case NVPTXISD::PrintCallUni:
613     return "NVPTXISD::PrintCallUni";
614   case NVPTXISD::PrintConvergentCallUni:
615     return "NVPTXISD::PrintConvergentCallUni";
616   case NVPTXISD::LoadParam:
617     return "NVPTXISD::LoadParam";
618   case NVPTXISD::LoadParamV2:
619     return "NVPTXISD::LoadParamV2";
620   case NVPTXISD::LoadParamV4:
621     return "NVPTXISD::LoadParamV4";
622   case NVPTXISD::StoreParam:
623     return "NVPTXISD::StoreParam";
624   case NVPTXISD::StoreParamV2:
625     return "NVPTXISD::StoreParamV2";
626   case NVPTXISD::StoreParamV4:
627     return "NVPTXISD::StoreParamV4";
628   case NVPTXISD::StoreParamS32:
629     return "NVPTXISD::StoreParamS32";
630   case NVPTXISD::StoreParamU32:
631     return "NVPTXISD::StoreParamU32";
632   case NVPTXISD::CallArgBegin:
633     return "NVPTXISD::CallArgBegin";
634   case NVPTXISD::CallArg:
635     return "NVPTXISD::CallArg";
636   case NVPTXISD::LastCallArg:
637     return "NVPTXISD::LastCallArg";
638   case NVPTXISD::CallArgEnd:
639     return "NVPTXISD::CallArgEnd";
640   case NVPTXISD::CallVoid:
641     return "NVPTXISD::CallVoid";
642   case NVPTXISD::CallVal:
643     return "NVPTXISD::CallVal";
644   case NVPTXISD::CallSymbol:
645     return "NVPTXISD::CallSymbol";
646   case NVPTXISD::Prototype:
647     return "NVPTXISD::Prototype";
648   case NVPTXISD::MoveParam:
649     return "NVPTXISD::MoveParam";
650   case NVPTXISD::StoreRetval:
651     return "NVPTXISD::StoreRetval";
652   case NVPTXISD::StoreRetvalV2:
653     return "NVPTXISD::StoreRetvalV2";
654   case NVPTXISD::StoreRetvalV4:
655     return "NVPTXISD::StoreRetvalV4";
656   case NVPTXISD::PseudoUseParam:
657     return "NVPTXISD::PseudoUseParam";
658   case NVPTXISD::RETURN:
659     return "NVPTXISD::RETURN";
660   case NVPTXISD::CallSeqBegin:
661     return "NVPTXISD::CallSeqBegin";
662   case NVPTXISD::CallSeqEnd:
663     return "NVPTXISD::CallSeqEnd";
664   case NVPTXISD::CallPrototype:
665     return "NVPTXISD::CallPrototype";
666   case NVPTXISD::ProxyReg:
667     return "NVPTXISD::ProxyReg";
668  case NVPTXISD::LoadV2:
669  return "NVPTXISD::LoadV2";
670  case NVPTXISD::LoadV4:
671  return "NVPTXISD::LoadV4";
672  case NVPTXISD::LDGV2:
673  return "NVPTXISD::LDGV2";
674  case NVPTXISD::LDGV4:
675  return "NVPTXISD::LDGV4";
676  case NVPTXISD::LDUV2:
677  return "NVPTXISD::LDUV2";
678  case NVPTXISD::LDUV4:
679  return "NVPTXISD::LDUV4";
680  case NVPTXISD::StoreV2:
681  return "NVPTXISD::StoreV2";
682  case NVPTXISD::StoreV4:
683  return "NVPTXISD::StoreV4";
684   case NVPTXISD::FUN_SHFL_CLAMP:
685     return "NVPTXISD::FUN_SHFL_CLAMP";
686   case NVPTXISD::FUN_SHFR_CLAMP:
687     return "NVPTXISD::FUN_SHFR_CLAMP";
688   case NVPTXISD::IMAD:
689     return "NVPTXISD::IMAD";
690   case NVPTXISD::SETP_F16X2:
691     return "NVPTXISD::SETP_F16X2";
692   case NVPTXISD::Dummy:
693     return "NVPTXISD::Dummy";
694   case NVPTXISD::MUL_WIDE_SIGNED:
695     return "NVPTXISD::MUL_WIDE_SIGNED";
696   case NVPTXISD::MUL_WIDE_UNSIGNED:
697     return "NVPTXISD::MUL_WIDE_UNSIGNED";
698  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
699  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
701  return "NVPTXISD::Tex1DFloatFloatLevel";
703  return "NVPTXISD::Tex1DFloatFloatGrad";
704  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
705  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
707  return "NVPTXISD::Tex1DS32FloatLevel";
709  return "NVPTXISD::Tex1DS32FloatGrad";
710  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
711  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
713  return "NVPTXISD::Tex1DU32FloatLevel";
715  return "NVPTXISD::Tex1DU32FloatGrad";
716  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
717  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
719  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
721  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
722  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
723  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
725  return "NVPTXISD::Tex1DArrayS32FloatLevel";
727  return "NVPTXISD::Tex1DArrayS32FloatGrad";
728  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
729  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
731  return "NVPTXISD::Tex1DArrayU32FloatLevel";
733  return "NVPTXISD::Tex1DArrayU32FloatGrad";
734  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
735  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
737  return "NVPTXISD::Tex2DFloatFloatLevel";
739  return "NVPTXISD::Tex2DFloatFloatGrad";
740  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
741  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
743  return "NVPTXISD::Tex2DS32FloatLevel";
745  return "NVPTXISD::Tex2DS32FloatGrad";
746  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
747  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
749  return "NVPTXISD::Tex2DU32FloatLevel";
751  return "NVPTXISD::Tex2DU32FloatGrad";
752  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
753  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
755  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
757  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
758  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
759  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
761  return "NVPTXISD::Tex2DArrayS32FloatLevel";
763  return "NVPTXISD::Tex2DArrayS32FloatGrad";
764  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
765  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
767  return "NVPTXISD::Tex2DArrayU32FloatLevel";
769  return "NVPTXISD::Tex2DArrayU32FloatGrad";
770  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
771  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
773  return "NVPTXISD::Tex3DFloatFloatLevel";
775  return "NVPTXISD::Tex3DFloatFloatGrad";
776  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
777  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
779  return "NVPTXISD::Tex3DS32FloatLevel";
781  return "NVPTXISD::Tex3DS32FloatGrad";
782  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
783  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
785  return "NVPTXISD::Tex3DU32FloatLevel";
787  return "NVPTXISD::Tex3DU32FloatGrad";
788  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
790  return "NVPTXISD::TexCubeFloatFloatLevel";
791  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
793  return "NVPTXISD::TexCubeS32FloatLevel";
794  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
796  return "NVPTXISD::TexCubeU32FloatLevel";
798  return "NVPTXISD::TexCubeArrayFloatFloat";
800  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
802  return "NVPTXISD::TexCubeArrayS32Float";
804  return "NVPTXISD::TexCubeArrayS32FloatLevel";
806  return "NVPTXISD::TexCubeArrayU32Float";
808  return "NVPTXISD::TexCubeArrayU32FloatLevel";
810  return "NVPTXISD::Tld4R2DFloatFloat";
812  return "NVPTXISD::Tld4G2DFloatFloat";
814  return "NVPTXISD::Tld4B2DFloatFloat";
816  return "NVPTXISD::Tld4A2DFloatFloat";
818  return "NVPTXISD::Tld4R2DS64Float";
820  return "NVPTXISD::Tld4G2DS64Float";
822  return "NVPTXISD::Tld4B2DS64Float";
824  return "NVPTXISD::Tld4A2DS64Float";
826  return "NVPTXISD::Tld4R2DU64Float";
828  return "NVPTXISD::Tld4G2DU64Float";
830  return "NVPTXISD::Tld4B2DU64Float";
832  return "NVPTXISD::Tld4A2DU64Float";
833 
835  return "NVPTXISD::TexUnified1DFloatS32";
837  return "NVPTXISD::TexUnified1DFloatFloat";
839  return "NVPTXISD::TexUnified1DFloatFloatLevel";
841  return "NVPTXISD::TexUnified1DFloatFloatGrad";
843  return "NVPTXISD::TexUnified1DS32S32";
845  return "NVPTXISD::TexUnified1DS32Float";
847  return "NVPTXISD::TexUnified1DS32FloatLevel";
849  return "NVPTXISD::TexUnified1DS32FloatGrad";
851  return "NVPTXISD::TexUnified1DU32S32";
853  return "NVPTXISD::TexUnified1DU32Float";
855  return "NVPTXISD::TexUnified1DU32FloatLevel";
857  return "NVPTXISD::TexUnified1DU32FloatGrad";
859  return "NVPTXISD::TexUnified1DArrayFloatS32";
861  return "NVPTXISD::TexUnified1DArrayFloatFloat";
863  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
865  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
867  return "NVPTXISD::TexUnified1DArrayS32S32";
869  return "NVPTXISD::TexUnified1DArrayS32Float";
871  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
873  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
875  return "NVPTXISD::TexUnified1DArrayU32S32";
877  return "NVPTXISD::TexUnified1DArrayU32Float";
879  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
881  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
883  return "NVPTXISD::TexUnified2DFloatS32";
885  return "NVPTXISD::TexUnified2DFloatFloat";
887  return "NVPTXISD::TexUnified2DFloatFloatLevel";
889  return "NVPTXISD::TexUnified2DFloatFloatGrad";
891  return "NVPTXISD::TexUnified2DS32S32";
893  return "NVPTXISD::TexUnified2DS32Float";
895  return "NVPTXISD::TexUnified2DS32FloatLevel";
897  return "NVPTXISD::TexUnified2DS32FloatGrad";
899  return "NVPTXISD::TexUnified2DU32S32";
901  return "NVPTXISD::TexUnified2DU32Float";
903  return "NVPTXISD::TexUnified2DU32FloatLevel";
905  return "NVPTXISD::TexUnified2DU32FloatGrad";
907  return "NVPTXISD::TexUnified2DArrayFloatS32";
909  return "NVPTXISD::TexUnified2DArrayFloatFloat";
911  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
913  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
915  return "NVPTXISD::TexUnified2DArrayS32S32";
917  return "NVPTXISD::TexUnified2DArrayS32Float";
919  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
921  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
923  return "NVPTXISD::TexUnified2DArrayU32S32";
925  return "NVPTXISD::TexUnified2DArrayU32Float";
927  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
929  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
931  return "NVPTXISD::TexUnified3DFloatS32";
933  return "NVPTXISD::TexUnified3DFloatFloat";
935  return "NVPTXISD::TexUnified3DFloatFloatLevel";
937  return "NVPTXISD::TexUnified3DFloatFloatGrad";
939  return "NVPTXISD::TexUnified3DS32S32";
941  return "NVPTXISD::TexUnified3DS32Float";
943  return "NVPTXISD::TexUnified3DS32FloatLevel";
945  return "NVPTXISD::TexUnified3DS32FloatGrad";
947  return "NVPTXISD::TexUnified3DU32S32";
949  return "NVPTXISD::TexUnified3DU32Float";
951  return "NVPTXISD::TexUnified3DU32FloatLevel";
953  return "NVPTXISD::TexUnified3DU32FloatGrad";
955  return "NVPTXISD::TexUnifiedCubeFloatFloat";
957  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
959  return "NVPTXISD::TexUnifiedCubeS32Float";
961  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
963  return "NVPTXISD::TexUnifiedCubeU32Float";
965  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
967  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
969  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
971  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
973  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
975  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
977  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
979  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
981  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
983  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
985  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
987  return "NVPTXISD::Tld4UnifiedR2DS64Float";
989  return "NVPTXISD::Tld4UnifiedG2DS64Float";
991  return "NVPTXISD::Tld4UnifiedB2DS64Float";
993  return "NVPTXISD::Tld4UnifiedA2DS64Float";
995  return "NVPTXISD::Tld4UnifiedR2DU64Float";
997  return "NVPTXISD::Tld4UnifiedG2DU64Float";
999  return "NVPTXISD::Tld4UnifiedB2DU64Float";
1001  return "NVPTXISD::Tld4UnifiedA2DU64Float";
1002 
1003  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1004  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1005  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1006  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1007  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1008  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1009  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1010  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1011  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1012  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1013  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1014 
1015  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1016  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1017  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1018  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1019  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1020  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1021  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1022  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1023  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1024  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1025  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1026 
1027  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1028  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1029  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1030  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1031  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1032  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1033  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1034  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1035  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1036  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1037  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1038 
1039  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1040  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1041  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1042  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1043  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1044  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1045  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1046  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1047  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1048  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1049  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1050 
1051  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1052  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1053  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1054  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1055  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1056  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1057  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1058  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1059  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1060  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1061  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1062 
1063  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1064  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1065  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1066  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1067  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1068  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1069  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1070  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1071  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1072  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1073  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1074 
1075  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1076  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1077  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1078  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1079  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1080  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1081  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1082  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1083  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1084  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1085  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1086 
1087  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1088  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1089  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1090  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1091  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1092  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1093  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1094  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1095  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1096  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1097  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1098 
1099  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1100  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1101  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1102  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1103  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1104  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1105  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1106  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1107  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1108  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1109  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1110 
1111  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1112  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1113  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1114  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1115  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1116  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1117  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1118  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1119  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1120  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1121  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1122 
1123  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1124  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1125  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1126  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1127  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1128  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1129  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1130  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1131  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1132  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1133  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1134 
1135  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1136  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1137  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1138  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1139  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1140  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1141  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1142  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1143  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1144  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1145  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1146 
1147  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1148  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1149  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1150  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1151  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1152  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1153  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1154  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1155  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1156  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1157  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1158 
1159  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1160  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1161  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1162  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1163  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1164  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1165  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1166  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1167  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1168  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1169  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1170 
1171  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1172  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1173  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1174  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1175  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1176  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1177  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1178  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1179  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1180  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1181  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1182  }
1183  return nullptr;
1184 }
1185 
1186 TargetLoweringBase::LegalizeTypeAction
1187 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1188   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1189  return TypeSplitVector;
1190  if (VT == MVT::v2f16)
1191  return TypeLegal;
1192   return TargetLoweringBase::getPreferredVectorAction(VT);
1193 }
1194 
1195 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1196                                              int Enabled, int &ExtraSteps,
1197  bool &UseOneConst,
1198  bool Reciprocal) const {
1199  if (!(Enabled == ReciprocalEstimate::Enabled ||
1200  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1201  return SDValue();
1202 
1203  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1204  ExtraSteps = 0;
1205 
1206  SDLoc DL(Operand);
1207  EVT VT = Operand.getValueType();
1208  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1209 
1210  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1211  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1212  DAG.getConstant(IID, DL, MVT::i32), Operand);
1213  };
1214 
1215  // The sqrt and rsqrt refinement processes assume we always start out with an
1216  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1217  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1218  // any refinement, we must return a regular sqrt.
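  // (Illustrative note, beyond what the original comments state: when
  // ExtraSteps > 0, the generic DAG combiner takes the rsqrt estimate
  // returned here and refines it with Newton-Raphson iterations, multiplying
  // by the operand again when a plain sqrt was requested.)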
1219  if (Reciprocal || ExtraSteps > 0) {
1220  if (VT == MVT::f32)
1221  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1222  : Intrinsic::nvvm_rsqrt_approx_f);
1223  else if (VT == MVT::f64)
1224  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1225  else
1226  return SDValue();
1227  } else {
1228  if (VT == MVT::f32)
1229  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1230  : Intrinsic::nvvm_sqrt_approx_f);
1231  else {
1232  // There's no sqrt.approx.f64 instruction, so we emit
1233  // reciprocal(rsqrt(x)). This is faster than
1234  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1235  // x * rsqrt(x).)
1236  return DAG.getNode(
1237  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1238  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1239  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1240  }
1241  }
1242 }
1243 
1244 SDValue
1245 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1246   SDLoc dl(Op);
1247  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1248  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1249  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1250  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1251 }
1252 
1253 std::string NVPTXTargetLowering::getPrototype(
1254     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1255  const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
1256  ImmutableCallSite CS) const {
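  // Illustrative example (not from the original source): for a callee with the
  // C prototype "float f(int, float)" this produces a string along the lines of
  //   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);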
1257  auto PtrVT = getPointerTy(DL);
1258 
1259  bool isABI = (STI.getSmVersion() >= 20);
1260  assert(isABI && "Non-ABI compilation is not supported");
1261  if (!isABI)
1262  return "";
1263 
1264  std::stringstream O;
1265  O << "prototype_" << uniqueCallSite << " : .callprototype ";
1266 
1267  if (retTy->getTypeID() == Type::VoidTyID) {
1268  O << "()";
1269  } else {
1270  O << "(";
1271  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1272  unsigned size = 0;
1273  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1274  size = ITy->getBitWidth();
1275  } else {
1276  assert(retTy->isFloatingPointTy() &&
1277  "Floating point type expected here");
1278  size = retTy->getPrimitiveSizeInBits();
1279  }
1280  // PTX ABI requires all scalar return values to be at least 32
1281  // bits in size. fp16 normally uses .b16 as its storage type in
1282  // PTX, so its size must be adjusted here, too.
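      // For example, an i8 or half return value is therefore declared as
      // ".param .b32 _" rather than ".param .b8 _" / ".param .b16 _".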
1283  if (size < 32)
1284  size = 32;
1285 
1286  O << ".param .b" << size << " _";
1287  } else if (isa<PointerType>(retTy)) {
1288  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1289  } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
1290  auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
1291  O << ".param .align " << retAlignment << " .b8 _["
1292  << DL.getTypeAllocSize(retTy) << "]";
1293  } else {
1294  llvm_unreachable("Unknown return type");
1295  }
1296  O << ") ";
1297  }
1298  O << "_ (";
1299 
1300  bool first = true;
1301 
1302  unsigned OIdx = 0;
1303  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1304  Type *Ty = Args[i].Ty;
1305  if (!first) {
1306  O << ", ";
1307  }
1308  first = false;
1309 
1310  if (!Outs[OIdx].Flags.isByVal()) {
1311  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1312  unsigned align = 0;
1313  const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1314  // +1 because index 0 is reserved for return type alignment
1315  if (!getAlign(*CallI, i + 1, align))
1316  align = DL.getABITypeAlignment(Ty);
1317  unsigned sz = DL.getTypeAllocSize(Ty);
1318  O << ".param .align " << align << " .b8 ";
1319  O << "_";
1320  O << "[" << sz << "]";
1321  // update the index for Outs
1322  SmallVector<EVT, 16> vtparts;
1323  ComputeValueVTs(*this, DL, Ty, vtparts);
1324  if (unsigned len = vtparts.size())
1325  OIdx += len - 1;
1326  continue;
1327  }
1328  // i8 types in IR will be i16 types in SDAG
1329  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1330  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1331  "type mismatch between callee prototype and arguments");
1332  // scalar type
1333  unsigned sz = 0;
1334  if (isa<IntegerType>(Ty)) {
1335  sz = cast<IntegerType>(Ty)->getBitWidth();
1336  if (sz < 32)
1337  sz = 32;
1338  } else if (isa<PointerType>(Ty)) {
1339  sz = PtrVT.getSizeInBits();
1340  } else if (Ty->isHalfTy())
1341  // PTX ABI requires all scalar parameters to be at least 32
1342  // bits in size. fp16 normally uses .b16 as its storage type
1343  // in PTX, so its size must be adjusted here, too.
1344  sz = 32;
1345  else
1346  sz = Ty->getPrimitiveSizeInBits();
1347  O << ".param .b" << sz << " ";
1348  O << "_";
1349  continue;
1350  }
1351  auto *PTy = dyn_cast<PointerType>(Ty);
1352  assert(PTy && "Param with byval attribute should be a pointer type");
1353  Type *ETy = PTy->getElementType();
1354 
1355  unsigned align = Outs[OIdx].Flags.getByValAlign();
1356  unsigned sz = DL.getTypeAllocSize(ETy);
1357  O << ".param .align " << align << " .b8 ";
1358  O << "_";
1359  O << "[" << sz << "]";
1360  }
1361  O << ");";
1362  return O.str();
1363 }
1364 
1365 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1366  ImmutableCallSite CS,
1367  Type *Ty, unsigned Idx,
1368  const DataLayout &DL) const {
1369  if (!CS) {
1370  // CallSite is zero, fallback to ABI type alignment
1371  return DL.getABITypeAlignment(Ty);
1372  }
1373 
1374  unsigned Align = 0;
1375  const Value *DirectCallee = CS.getCalledFunction();
1376 
1377  if (!DirectCallee) {
1378  // We don't have a direct function symbol, but that may be because of
1379  // constant cast instructions in the call.
1380  const Instruction *CalleeI = CS.getInstruction();
1381  assert(CalleeI && "Call target is not a function or derived value?");
1382 
1383  // With bitcast'd call targets, the instruction will be the call
1384  if (isa<CallInst>(CalleeI)) {
1385  // Check if we have call alignment metadata
1386  if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1387  return Align;
1388 
1389  const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1390  // Ignore any bitcast instructions
1391  while (isa<ConstantExpr>(CalleeV)) {
1392  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1393  if (!CE->isCast())
1394  break;
1395  // Look through the bitcast
1396  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1397  }
1398 
1399  // We have now looked past all of the bitcasts. Do we finally have a
1400  // Function?
1401  if (isa<Function>(CalleeV))
1402  DirectCallee = CalleeV;
1403  }
1404  }
1405 
1406  // Check for function alignment information if we found that the
1407  // ultimate target is a Function
1408  if (DirectCallee)
1409  if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1410  return Align;
1411 
1412  // Call is indirect or alignment information is not available, fall back to
1413  // the ABI type alignment
1414  return DL.getABITypeAlignment(Ty);
1415 }
1416 
1417 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1418                                        SmallVectorImpl<SDValue> &InVals) const {
1419  SelectionDAG &DAG = CLI.DAG;
1420  SDLoc dl = CLI.DL;
1421   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1422   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1423   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1424   SDValue Chain = CLI.Chain;
1425  SDValue Callee = CLI.Callee;
1426  bool &isTailCall = CLI.IsTailCall;
1427  ArgListTy &Args = CLI.getArgs();
1428  Type *RetTy = CLI.RetTy;
1429  ImmutableCallSite CS = CLI.CS;
1430  const DataLayout &DL = DAG.getDataLayout();
1431 
1432  bool isABI = (STI.getSmVersion() >= 20);
1433  assert(isABI && "Non-ABI compilation is not supported");
1434  if (!isABI)
1435  return Chain;
1436 
1437  SDValue tempChain = Chain;
1438  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1439  SDValue InFlag = Chain.getValue(1);
1440 
1441  unsigned paramCount = 0;
1442  // Args.size() and Outs.size() need not match.
1443  // Outs.size() will be larger
1444  // * if there is an aggregate argument with multiple fields (each field
1445  // showing up separately in Outs)
1446  // * if there is a vector argument with more than typical vector-length
1447  // elements (generally if more than 4) where each vector element is
1448  // individually present in Outs.
1449  // So a different index should be used for indexing into Outs/OutVals.
1450  // See similar issue in LowerFormalArguments.
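  // For example, a single aggregate argument such as "struct { i32 a, b; }"
  // appears once in Args but contributes two entries to Outs/OutVals.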
1451  unsigned OIdx = 0;
1452   // Declare the .params or .reg needed to pass values
1453   // to the function.
1454  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1455  EVT VT = Outs[OIdx].VT;
1456  Type *Ty = Args[i].Ty;
1457 
1458  if (!Outs[OIdx].Flags.isByVal()) {
1459       SmallVector<EVT, 16> VTs;
1460       SmallVector<uint64_t, 16> Offsets;
1461       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1462  unsigned ArgAlign =
1463  getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1464  unsigned AllocSize = DL.getTypeAllocSize(Ty);
1465  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1466  bool NeedAlign; // Does argument declaration specify alignment?
1467  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1468  // declare .param .align <align> .b8 .param<n>[<size>];
1469  SDValue DeclareParamOps[] = {
1470  Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1471  DAG.getConstant(paramCount, dl, MVT::i32),
1472  DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1473  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1474  DeclareParamOps);
1475  NeedAlign = true;
1476  } else {
1477  // declare .param .b<size> .param<n>;
1478  if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1479  // PTX ABI requires integral types to be at least 32 bits in
1480  // size. FP16 is loaded/stored using i16, so it's handled
1481  // here as well.
1482  AllocSize = 4;
1483  }
1484  SDValue DeclareScalarParamOps[] = {
1485  Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1486  DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1487  DAG.getConstant(0, dl, MVT::i32), InFlag};
1488  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1489  DeclareScalarParamOps);
1490  NeedAlign = false;
1491  }
1492  InFlag = Chain.getValue(1);
1493 
1494  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1495  // than 32-bits are sign extended or zero extended, depending on
1496  // whether they are signed or unsigned types. This case applies
1497  // only to scalar parameters and not to aggregate values.
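      // For example, an i16 argument is widened to i32 below and stored into
      // a 32-bit .param slot; whether it is sign- or zero-extended follows the
      // argument's signext/zeroext attribute.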
1498  bool ExtendIntegerParam =
1499  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1500 
1501  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1502  SmallVector<SDValue, 6> StoreOperands;
1503  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1504  // New store.
1505  if (VectorInfo[j] & PVF_FIRST) {
1506           assert(StoreOperands.empty() && "Unfinished preceding store.");
1507  StoreOperands.push_back(Chain);
1508  StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1509  StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1510  }
1511 
1512  EVT EltVT = VTs[j];
1513  SDValue StVal = OutVals[OIdx];
1514  if (ExtendIntegerParam) {
1515  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1516  // zext/sext to i32
1517  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1518  : ISD::ZERO_EXTEND,
1519  dl, MVT::i32, StVal);
1520  } else if (EltVT.getSizeInBits() < 16) {
1521  // Use 16-bit registers for small stores as it's the
1522  // smallest general purpose register size supported by NVPTX.
1523  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1524  }
1525 
1526  // Record the value to store.
1527  StoreOperands.push_back(StVal);
1528 
1529  if (VectorInfo[j] & PVF_LAST) {
1530  unsigned NumElts = StoreOperands.size() - 3;
1531           NVPTXISD::NodeType Op;
1532           switch (NumElts) {
1533  case 1:
1534  Op = NVPTXISD::StoreParam;
1535  break;
1536           case 2:
1537             Op = NVPTXISD::StoreParamV2;
1538             break;
1539           case 4:
1540             Op = NVPTXISD::StoreParamV4;
1541             break;
1542  default:
1543  llvm_unreachable("Invalid vector info.");
1544  }
1545 
1546  StoreOperands.push_back(InFlag);
1547 
1548  // Adjust type of the store op if we've extended the scalar
1549  // return value.
1550  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1551  unsigned EltAlign =
1552  NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1553 
1554  Chain = DAG.getMemIntrinsicNode(
1555  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1556  TheStoreType, MachinePointerInfo(), EltAlign,
1557             MachineMemOperand::MOStore);
1558         InFlag = Chain.getValue(1);
1559 
1560  // Cleanup.
1561  StoreOperands.clear();
1562  }
1563  ++OIdx;
1564  }
1565  assert(StoreOperands.empty() && "Unfinished parameter store.");
1566  if (VTs.size() > 0)
1567  --OIdx;
1568  ++paramCount;
1569  continue;
1570  }
1571 
1572  // ByVal arguments
1573     SmallVector<EVT, 16> VTs;
1574     SmallVector<uint64_t, 16> Offsets;
1575     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1576  assert(PTy && "Type of a byval parameter should be pointer");
1577  ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1578 
1579  // declare .param .align <align> .b8 .param<n>[<size>];
1580  unsigned sz = Outs[OIdx].Flags.getByValSize();
1581  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1582  unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1583     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1584  // so we don't need to worry about natural alignment or not.
1585  // See TargetLowering::LowerCallTo().
1586 
1587     // Enforce minimum alignment of 4 to work around ptxas miscompile
1588  // for sm_50+. See corresponding alignment adjustment in
1589  // emitFunctionParamList() for details.
1590  if (ArgAlign < 4)
1591  ArgAlign = 4;
1592  SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1593  DAG.getConstant(paramCount, dl, MVT::i32),
1594  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1595  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1596  DeclareParamOps);
1597  InFlag = Chain.getValue(1);
1598  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1599  EVT elemtype = VTs[j];
1600  int curOffset = Offsets[j];
1601  unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1602  auto PtrVT = getPointerTy(DL);
1603  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1604  DAG.getConstant(curOffset, dl, PtrVT));
1605  SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1606  MachinePointerInfo(), PartAlign);
1607  if (elemtype.getSizeInBits() < 16) {
1608  theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1609  }
1610  SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1611  SDValue CopyParamOps[] = { Chain,
1612  DAG.getConstant(paramCount, dl, MVT::i32),
1613  DAG.getConstant(curOffset, dl, MVT::i32),
1614  theVal, InFlag };
1615  Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1616  CopyParamOps, elemtype,
1617  MachinePointerInfo(), /* Align */ 0,
1618                                       MachineMemOperand::MOStore);
1619 
1620  InFlag = Chain.getValue(1);
1621  }
1622  ++paramCount;
1623  }
1624 
1625   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1626   unsigned retAlignment = 0;
1627 
1628  // Handle Result
1629  if (Ins.size() > 0) {
1630  SmallVector<EVT, 16> resvtparts;
1631  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1632 
1633  // Declare
1634  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1635  // .param .b<size-in-bits> retval0
1636  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1637  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1638  // these three types to match the logic in
1639  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1640  // Plus, this behavior is consistent with nvcc's.
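  // Illustrative examples (not from the original source, assuming no explicit
  // align metadata on the call): a call returning "struct { float x, y, z; }"
  // would declare ".param .align 4 .b8 retval0[12]", while one returning i16
  // would declare ".param .b32 retval0".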
1641  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1642  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1643  // Scalar needs to be at least 32bit wide
1644  if (resultsz < 32)
1645  resultsz = 32;
1646  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1647  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1648  DAG.getConstant(resultsz, dl, MVT::i32),
1649  DAG.getConstant(0, dl, MVT::i32), InFlag };
1650  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1651  DeclareRetOps);
1652  InFlag = Chain.getValue(1);
1653  } else {
1654  retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1655  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1656  SDValue DeclareRetOps[] = { Chain,
1657  DAG.getConstant(retAlignment, dl, MVT::i32),
1658  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1659  DAG.getConstant(0, dl, MVT::i32), InFlag };
1660  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1661  DeclareRetOps);
1662  InFlag = Chain.getValue(1);
1663  }
1664  }
1665 
1666  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1667  // between them we must rely on the call site value which is valid for
1668  // indirect calls but is always null for libcalls.
1669  bool isIndirectCall = !Func && CS;
1670 
1671  if (isa<ExternalSymbolSDNode>(Callee)) {
1672  Function* CalleeFunc = nullptr;
1673 
1674  // Try to find the callee in the current module.
1675  Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1676  assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1677 
1678  // Set the "libcall callee" attribute to indicate that the function
1679  // must always have a declaration.
1680  CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1681  }
1682 
1683  if (isIndirectCall) {
1684  // This is the indirect function call case: PTX requires a prototype of the
1685  // form
1686  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1687  // to be emitted, and the label has to be used as the last arg of the call
1688  // instruction.
1689  // The prototype is embedded in a string and used as the operand of a
1690  // CallPrototype SDNode, which prints out as the value of the string.
1691  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1692  std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1693  const char *ProtoStr =
1694  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1695  SDValue ProtoOps[] = {
1696  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1697  };
1698  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1699  InFlag = Chain.getValue(1);
1700  }
1701  // Op to just print "call"
1702  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1703  SDValue PrintCallOps[] = {
1704  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1705  };
1706  // We model convergent calls as separate opcodes.
1707  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1708  if (CLI.IsConvergent)
1709  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1710  : NVPTXISD::PrintConvergentCall;
1711  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1712  InFlag = Chain.getValue(1);
1713 
1714  // Ops to print out the function name
1715  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1716  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1717  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1718  InFlag = Chain.getValue(1);
1719 
1720  // Ops to print out the param list
1721  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1722  SDValue CallArgBeginOps[] = { Chain, InFlag };
1723  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1724  CallArgBeginOps);
1725  InFlag = Chain.getValue(1);
1726 
1727  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1728  unsigned opcode;
1729  if (i == (e - 1))
1730  opcode = NVPTXISD::LastCallArg;
1731  else
1732  opcode = NVPTXISD::CallArg;
1733  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1734  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1735  DAG.getConstant(i, dl, MVT::i32), InFlag };
1736  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1737  InFlag = Chain.getValue(1);
1738  }
1739  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1740  SDValue CallArgEndOps[] = { Chain,
1741  DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1742  InFlag };
1743  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1744  InFlag = Chain.getValue(1);
1745 
1746  if (isIndirectCall) {
1747  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1748  SDValue PrototypeOps[] = { Chain,
1749  DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1750  InFlag };
1751  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1752  InFlag = Chain.getValue(1);
1753  }
1754 
1755  SmallVector<SDValue, 16> ProxyRegOps;
1756  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1757 
1758  // Generate loads from param memory/moves from registers for result
1759  if (Ins.size() > 0) {
1760  SmallVector<EVT, 16> VTs;
1761  SmallVector<uint64_t, 16> Offsets;
1762  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1763  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1764 
1765  unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1766  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1767 
1768  SmallVector<EVT, 6> LoadVTs;
1769  int VecIdx = -1; // Index of the first element of the vector.
1770 
1771  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1772  // 32-bits are sign extended or zero extended, depending on whether
1773  // they are signed or unsigned types.
1774  bool ExtendIntegerRetVal =
1775  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1776 
1777  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1778  bool needTruncate = false;
1779  EVT TheLoadType = VTs[i];
1780  EVT EltType = Ins[i].VT;
1781  unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1782  if (ExtendIntegerRetVal) {
1783  TheLoadType = MVT::i32;
1784  EltType = MVT::i32;
1785  needTruncate = true;
1786  } else if (TheLoadType.getSizeInBits() < 16) {
1787  if (VTs[i].isInteger())
1788  needTruncate = true;
1789  EltType = MVT::i16;
1790  }
1791 
1792  // Record index of the very first element of the vector.
1793  if (VectorInfo[i] & PVF_FIRST) {
1794  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1795  VecIdx = i;
1796  }
1797 
1798  LoadVTs.push_back(EltType);
1799 
1800  if (VectorInfo[i] & PVF_LAST) {
1801  unsigned NumElts = LoadVTs.size();
1802  LoadVTs.push_back(MVT::Other);
1803  LoadVTs.push_back(MVT::Glue);
1804  NVPTXISD::NodeType Op;
1805  switch (NumElts) {
1806  case 1:
1807  Op = NVPTXISD::LoadParam;
1808  break;
1809  case 2:
1810  Op = NVPTXISD::LoadParamV2;
1811  break;
1812  case 4:
1813  Op = NVPTXISD::LoadParamV4;
1814  break;
1815  default:
1816  llvm_unreachable("Invalid vector info.");
1817  }
1818 
1819  SDValue LoadOperands[] = {
1820  Chain, DAG.getConstant(1, dl, MVT::i32),
1821  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1822  SDValue RetVal = DAG.getMemIntrinsicNode(
1823  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1824  MachinePointerInfo(), EltAlign,
1825  MachineMemOperand::MOLoad);
1826 
1827  for (unsigned j = 0; j < NumElts; ++j) {
1828  ProxyRegOps.push_back(RetVal.getValue(j));
1829 
1830  if (needTruncate)
1831  ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1832  else
1833  ProxyRegTruncates.push_back(Optional<MVT>());
1834  }
1835 
1836  Chain = RetVal.getValue(NumElts);
1837  InFlag = RetVal.getValue(NumElts + 1);
1838 
1839  // Cleanup
1840  VecIdx = -1;
1841  LoadVTs.clear();
1842  }
1843  }
1844  }
1845 
1846  Chain = DAG.getCALLSEQ_END(Chain,
1847  DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1848  DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1849  true),
1850  InFlag, dl);
1851  InFlag = Chain.getValue(1);
1852  uniqueCallSite++;
1853 
1854  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1855  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1856  // dangling.
1857  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1858  SDValue Ret = DAG.getNode(
1859  NVPTXISD::ProxyReg, dl,
1860  DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1861  { Chain, ProxyRegOps[i], InFlag }
1862  );
1863 
1864  Chain = Ret.getValue(1);
1865  InFlag = Ret.getValue(2);
1866 
1867  if (ProxyRegTruncates[i].hasValue()) {
1868  Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1869  }
1870 
1871  InVals.push_back(Ret);
1872  }
1873 
1874  // set isTailCall to false for now, until we figure out how to express
1875  // tail call optimization in PTX
1876  isTailCall = false;
1877  return Chain;
1878 }
1879 
1880 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1881 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1882 // We use extract/insert/build vector, just as LegalizeOp() did in LLVM 2.5.
1883 SDValue
1884 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1885  SDNode *Node = Op.getNode();
1886  SDLoc dl(Node);
1887  SmallVector<SDValue, 8> Ops;
1888  unsigned NumOperands = Node->getNumOperands();
1889  for (unsigned i = 0; i < NumOperands; ++i) {
1890  SDValue SubOp = Node->getOperand(i);
1891  EVT VVT = SubOp.getNode()->getValueType(0);
1892  EVT EltVT = VVT.getVectorElementType();
1893  unsigned NumSubElem = VVT.getVectorNumElements();
1894  for (unsigned j = 0; j < NumSubElem; ++j) {
1895  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1896  DAG.getIntPtrConstant(j, dl)));
1897  }
1898  }
1899  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1900 }
1901 
1902 // We can initialize a constant f16x2 with a single .b32 move. Normally it
1903 // would get lowered as two constant loads and a vector-packing move.
1904 // mov.b16 %h1, 0x4000;
1905 // mov.b16 %h2, 0x3C00;
1906 // mov.b32 %hh2, {%h2, %h1};
1907 // Instead we want just a constant move:
1908 // mov.b32 %hh2, 0x40003C00
1909 //
1910 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1911 // generates good SASS in both cases.
1912 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1913  SelectionDAG &DAG) const {
1914  //return Op;
1915  if (!(Op->getValueType(0) == MVT::v2f16 &&
1916  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1917  isa<ConstantFPSDNode>(Op->getOperand(1))))
1918  return Op;
1919 
1920  APInt E0 =
1921  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1922  APInt E1 =
1923  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1924  SDValue Const =
1925  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1926  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1927 }
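// Editor's note: an illustrative sketch, not part of this file. The constant
// combining above amounts to packing the raw IEEE-754 half bit patterns of the
// two elements into one 32-bit word (element 0 in the low half), e.g.
// 1.0h = 0x3C00 and 2.0h = 0x4000 give 0x40003C00:
//
//   #include <cstdint>
//   uint32_t packHalf2(uint16_t E0, uint16_t E1) {
//     return (uint32_t(E1) << 16) | E0; // packHalf2(0x3C00, 0x4000) == 0x40003C00
//   }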
1928 
1929 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1930  SelectionDAG &DAG) const {
1931  SDValue Index = Op->getOperand(1);
1932  // Constant index will be matched by tablegen.
1933  if (isa<ConstantSDNode>(Index.getNode()))
1934  return Op;
1935 
1936  // Extract individual elements and select one of them.
1937  SDValue Vector = Op->getOperand(0);
1938  EVT VectorVT = Vector.getValueType();
1939  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1940  EVT EltVT = VectorVT.getVectorElementType();
1941 
1942  SDLoc dl(Op.getNode());
1943  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1944  DAG.getIntPtrConstant(0, dl));
1945  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1946  DAG.getIntPtrConstant(1, dl));
1947  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1948  ISD::CondCode::SETEQ);
1949 }
1950 
1951 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1952 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1953 /// amount, or
1954 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1955 /// amount.
1956 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1957  SelectionDAG &DAG) const {
1958  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1959  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1960 
1961  EVT VT = Op.getValueType();
1962  unsigned VTBits = VT.getSizeInBits();
1963  SDLoc dl(Op);
1964  SDValue ShOpLo = Op.getOperand(0);
1965  SDValue ShOpHi = Op.getOperand(1);
1966  SDValue ShAmt = Op.getOperand(2);
1967  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1968 
1969  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1970  // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
1971  // {dHi, dLo} = {aHi, aLo} >> Amt
1972  // dHi = aHi >> Amt
1973  // dLo = shf.r.clamp aLo, aHi, Amt
1974 
1975  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1976  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1977  ShAmt);
1978 
1979  SDValue Ops[2] = { Lo, Hi };
1980  return DAG.getMergeValues(Ops, dl);
1981  }
1982  else {
1983  // {dHi, dLo} = {aHi, aLo} >> Amt
1984  // - if (Amt>=size) then
1985  // dLo = aHi >> (Amt-size)
1986  // dHi = aHi >> Amt (this is either all 0 or all 1)
1987  // else
1988  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1989  // dHi = aHi >> Amt
1990 
1991  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1992  DAG.getConstant(VTBits, dl, MVT::i32),
1993  ShAmt);
1994  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1995  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1996  DAG.getConstant(VTBits, dl, MVT::i32));
1997  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1998  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1999  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2000 
2001  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2002  DAG.getConstant(VTBits, dl, MVT::i32),
2003  ISD::SETGE);
2004  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2005  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2006 
2007  SDValue Ops[2] = { Lo, Hi };
2008  return DAG.getMergeValues(Ops, dl);
2009  }
2010 }
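// Editor's note: an illustrative sketch, not part of this file. For targets
// without the funnel-shift path, the expansion above is equivalent to this
// plain two-word logical right shift of a 64-bit value held as {Hi, Lo}
// (the arithmetic variant differs only in how Hi is shifted):
//
//   #include <cstdint>
//   void lshr2x32(uint32_t Lo, uint32_t Hi, unsigned Amt,
//                 uint32_t &OutLo, uint32_t &OutHi) {
//     if (Amt >= 32) {
//       OutLo = Hi >> (Amt - 32);
//       OutHi = 0;                                 // all zero for a logical shift
//     } else {
//       OutLo = Amt ? (Lo >> Amt) | (Hi << (32 - Amt)) : Lo;
//       OutHi = Hi >> Amt;
//     }
//   }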
2011 
2012 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2013 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2014 /// amount, or
2015 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2016 /// amount.
2017 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2018  SelectionDAG &DAG) const {
2019  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2020  assert(Op.getOpcode() == ISD::SHL_PARTS);
2021 
2022  EVT VT = Op.getValueType();
2023  unsigned VTBits = VT.getSizeInBits();
2024  SDLoc dl(Op);
2025  SDValue ShOpLo = Op.getOperand(0);
2026  SDValue ShOpHi = Op.getOperand(1);
2027  SDValue ShAmt = Op.getOperand(2);
2028 
2029  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2030  // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2031  // {dHi, dLo} = {aHi, aLo} << Amt
2032  // dHi = shf.l.clamp aLo, aHi, Amt
2033  // dLo = aLo << Amt
2034 
2035  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2036  ShAmt);
2037  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2038 
2039  SDValue Ops[2] = { Lo, Hi };
2040  return DAG.getMergeValues(Ops, dl);
2041  }
2042  else {
2043  // {dHi, dLo} = {aHi, aLo} << Amt
2044  // - if (Amt>=size) then
2045  // dLo = aLo << Amt (all 0)
2046  // dHi = aLo << (Amt-size)
2047  // else
2048  // dLo = aLo << Amt
2049  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2050 
2051  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2052  DAG.getConstant(VTBits, dl, MVT::i32),
2053  ShAmt);
2054  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2055  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2056  DAG.getConstant(VTBits, dl, MVT::i32));
2057  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2058  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2059  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2060 
2061  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2062  DAG.getConstant(VTBits, dl, MVT::i32),
2063  ISD::SETGE);
2064  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2065  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2066 
2067  SDValue Ops[2] = { Lo, Hi };
2068  return DAG.getMergeValues(Ops, dl);
2069  }
2070 }
2071 
2072 SDValue
2073  NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2074  switch (Op.getOpcode()) {
2075  case ISD::RETURNADDR:
2076  return SDValue();
2077  case ISD::FRAMEADDR:
2078  return SDValue();
2079  case ISD::GlobalAddress:
2080  return LowerGlobalAddress(Op, DAG);
2081  case ISD::INTRINSIC_W_CHAIN:
2082  return Op;
2083  case ISD::BUILD_VECTOR:
2084  return LowerBUILD_VECTOR(Op, DAG);
2085  case ISD::EXTRACT_SUBVECTOR:
2086  return Op;
2087  case ISD::EXTRACT_VECTOR_ELT:
2088  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2089  case ISD::CONCAT_VECTORS:
2090  return LowerCONCAT_VECTORS(Op, DAG);
2091  case ISD::STORE:
2092  return LowerSTORE(Op, DAG);
2093  case ISD::LOAD:
2094  return LowerLOAD(Op, DAG);
2095  case ISD::SHL_PARTS:
2096  return LowerShiftLeftParts(Op, DAG);
2097  case ISD::SRA_PARTS:
2098  case ISD::SRL_PARTS:
2099  return LowerShiftRightParts(Op, DAG);
2100  case ISD::SELECT:
2101  return LowerSelect(Op, DAG);
2102  default:
2103  llvm_unreachable("Custom lowering not defined for operation");
2104  }
2105 }
2106 
2107 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2108  SDValue Op0 = Op->getOperand(0);
2109  SDValue Op1 = Op->getOperand(1);
2110  SDValue Op2 = Op->getOperand(2);
2111  SDLoc DL(Op.getNode());
2112 
2113  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2114 
2115  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2116  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2117  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2118  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2119 
2120  return Trunc;
2121 }
2122 
2123 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2124  if (Op.getValueType() == MVT::i1)
2125  return LowerLOADi1(Op, DAG);
2126 
2127  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2128  // loads and have to handle it here.
2129  if (Op.getValueType() == MVT::v2f16) {
2130  LoadSDNode *Load = cast<LoadSDNode>(Op);
2131  EVT MemVT = Load->getMemoryVT();
2132  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2133  Load->getAddressSpace(), Load->getAlignment())) {
2134  SDValue Ops[2];
2135  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2136  return DAG.getMergeValues(Ops, SDLoc(Op));
2137  }
2138  }
2139 
2140  return SDValue();
2141 }
2142 
2143 // v = ld i1* addr
2144 // =>
2145 // v1 = ld i8* addr (-> i16)
2146 // v = trunc i16 to i1
2147 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2148  SDNode *Node = Op.getNode();
2149  LoadSDNode *LD = cast<LoadSDNode>(Node);
2150  SDLoc dl(Node);
2151  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2152  assert(Node->getValueType(0) == MVT::i1 &&
2153  "Custom lowering for i1 load only");
2154  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2155  LD->getPointerInfo(), LD->getAlignment(),
2156  LD->getMemOperand()->getFlags());
2157  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2158  // The legalizer (the caller) is expecting two values from the legalized
2159  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2160  // in LegalizeDAG.cpp which also uses MergeValues.
2161  SDValue Ops[] = { result, LD->getChain() };
2162  return DAG.getMergeValues(Ops, dl);
2163 }
2164 
2165 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2166  StoreSDNode *Store = cast<StoreSDNode>(Op);
2167  EVT VT = Store->getMemoryVT();
2168 
2169  if (VT == MVT::i1)
2170  return LowerSTOREi1(Op, DAG);
2171 
2172  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2173  // stores and have to handle it here.
2174  if (VT == MVT::v2f16 &&
2175  !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2176  Store->getAddressSpace(), Store->getAlignment()))
2177  return expandUnalignedStore(Store, DAG);
2178 
2179  if (VT.isVector())
2180  return LowerSTOREVector(Op, DAG);
2181 
2182  return SDValue();
2183 }
2184 
2185 SDValue
2186 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2187  SDNode *N = Op.getNode();
2188  SDValue Val = N->getOperand(1);
2189  SDLoc DL(N);
2190  EVT ValVT = Val.getValueType();
2191 
2192  if (ValVT.isVector()) {
2193  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2194  // legal. We can (and should) split that into 2 stores of <2 x double> here
2195  // but I'm leaving that as a TODO for now.
2196  if (!ValVT.isSimple())
2197  return SDValue();
2198  switch (ValVT.getSimpleVT().SimpleTy) {
2199  default:
2200  return SDValue();
2201  case MVT::v2i8:
2202  case MVT::v2i16:
2203  case MVT::v2i32:
2204  case MVT::v2i64:
2205  case MVT::v2f16:
2206  case MVT::v2f32:
2207  case MVT::v2f64:
2208  case MVT::v4i8:
2209  case MVT::v4i16:
2210  case MVT::v4i32:
2211  case MVT::v4f16:
2212  case MVT::v4f32:
2213  case MVT::v8f16: // <4 x f16x2>
2214  // This is a "native" vector type
2215  break;
2216  }
2217 
2218  MemSDNode *MemSD = cast<MemSDNode>(N);
2219  const DataLayout &TD = DAG.getDataLayout();
2220 
2221  unsigned Align = MemSD->getAlignment();
2222  unsigned PrefAlign =
2223  TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2224  if (Align < PrefAlign) {
2225  // This store is not sufficiently aligned, so bail out and let this vector
2226  // store be scalarized. Note that we may still be able to emit smaller
2227  // vector stores. For example, if we are storing a <4 x float> with an
2228  // alignment of 8, this check will fail but the legalizer will try again
2229  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2230  return SDValue();
2231  }
2232 
2233  unsigned Opcode = 0;
2234  EVT EltVT = ValVT.getVectorElementType();
2235  unsigned NumElts = ValVT.getVectorNumElements();
2236 
2237  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2238  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2239  // stored type to i16 and propagate the "real" type as the memory type.
2240  bool NeedExt = false;
2241  if (EltVT.getSizeInBits() < 16)
2242  NeedExt = true;
2243 
2244  bool StoreF16x2 = false;
2245  switch (NumElts) {
2246  default:
2247  return SDValue();
2248  case 2:
2249  Opcode = NVPTXISD::StoreV2;
2250  break;
2251  case 4:
2252  Opcode = NVPTXISD::StoreV4;
2253  break;
2254  case 8:
2255  // v8f16 is a special case. PTX doesn't have st.v8.f16
2256  // instruction. Instead, we split the vector into v2f16 chunks and
2257  // store them with st.v4.b32.
2258  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2259  Opcode = NVPTXISD::StoreV4;
2260  StoreF16x2 = true;
2261  break;
2262  }
2263 
2264  SmallVector<SDValue, 8> Ops;
2265 
2266  // First is the chain
2267  Ops.push_back(N->getOperand(0));
2268 
2269  if (StoreF16x2) {
2270  // Combine f16,f16 -> v2f16
2271  NumElts /= 2;
2272  for (unsigned i = 0; i < NumElts; ++i) {
2273  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2274  DAG.getIntPtrConstant(i * 2, DL));
2275  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2276  DAG.getIntPtrConstant(i * 2 + 1, DL));
2277  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2278  Ops.push_back(V2);
2279  }
2280  } else {
2281  // Then the split values
2282  for (unsigned i = 0; i < NumElts; ++i) {
2283  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2284  DAG.getIntPtrConstant(i, DL));
2285  if (NeedExt)
2286  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2287  Ops.push_back(ExtVal);
2288  }
2289  }
2290 
2291  // Then any remaining arguments
2292  Ops.append(N->op_begin() + 2, N->op_end());
2293 
2294  SDValue NewSt =
2295  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2296  MemSD->getMemoryVT(), MemSD->getMemOperand());
2297 
2298  // return DCI.CombineTo(N, NewSt, true);
2299  return NewSt;
2300  }
2301 
2302  return SDValue();
2303 }
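// Editor's note: an illustrative sketch, not part of this file. The v8f16
// special case above regroups eight halves into four 32-bit lanes so the
// store can be emitted as st.v4.b32; conceptually:
//
//   #include <cstdint>
//   void packV8F16(const uint16_t Elts[8], uint32_t Chunks[4]) {
//     for (int i = 0; i != 4; ++i)
//       Chunks[i] = (uint32_t(Elts[2 * i + 1]) << 16) | Elts[2 * i];
//   }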
2304 
2305 // st i1 v, addr
2306 // =>
2307 // v1 = zxt v to i16
2308 // st.u8 i16, addr
2309 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2310  SDNode *Node = Op.getNode();
2311  SDLoc dl(Node);
2312  StoreSDNode *ST = cast<StoreSDNode>(Node);
2313  SDValue Tmp1 = ST->getChain();
2314  SDValue Tmp2 = ST->getBasePtr();
2315  SDValue Tmp3 = ST->getValue();
2316  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2317  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2318  SDValue Result =
2319  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2320  ST->getAlignment(), ST->getMemOperand()->getFlags());
2321  return Result;
2322 }
2323 
2324 SDValue
2325 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2326  std::string ParamSym;
2327  raw_string_ostream ParamStr(ParamSym);
2328 
2329  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2330  ParamStr.flush();
2331 
2332  std::string *SavedStr =
2333  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2334  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2335 }
2336 
2337 // Check to see if the kernel argument is image*_t or sampler_t
2338 
2339 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2340  static const char *const specialTypes[] = { "struct._image2d_t",
2341  "struct._image3d_t",
2342  "struct._sampler_t" };
2343 
2344  Type *Ty = arg->getType();
2345  auto *PTy = dyn_cast<PointerType>(Ty);
2346 
2347  if (!PTy)
2348  return false;
2349 
2350  if (!context)
2351  return false;
2352 
2353  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2354  if (!STy || STy->isLiteral())
2355  return false;
2356 
2357  return std::find(std::begin(specialTypes), std::end(specialTypes),
2358  STy->getName()) != std::end(specialTypes);
2359 }
2360 
2361  SDValue NVPTXTargetLowering::LowerFormalArguments(
2362  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2363  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2364  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2365  MachineFunction &MF = DAG.getMachineFunction();
2366  const DataLayout &DL = DAG.getDataLayout();
2367  auto PtrVT = getPointerTy(DAG.getDataLayout());
2368 
2369  const Function *F = &MF.getFunction();
2370  const AttributeList &PAL = F->getAttributes();
2371  const TargetLowering *TLI = STI.getTargetLowering();
2372 
2373  SDValue Root = DAG.getRoot();
2374  std::vector<SDValue> OutChains;
2375 
2376  bool isABI = (STI.getSmVersion() >= 20);
2377  assert(isABI && "Non-ABI compilation is not supported");
2378  if (!isABI)
2379  return Chain;
2380 
2381  std::vector<Type *> argTypes;
2382  std::vector<const Argument *> theArgs;
2383  for (const Argument &I : F->args()) {
2384  theArgs.push_back(&I);
2385  argTypes.push_back(I.getType());
2386  }
2387  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2388  // Ins.size() will be larger
2389  // * if there is an aggregate argument with multiple fields (each field
2390  // showing up separately in Ins)
2391  // * if there is a vector argument with more than typical vector-length
2392  // elements (generally if more than 4) where each vector element is
2393  // individually present in Ins.
2394  // So a different index should be used for indexing into Ins.
2395  // See similar issue in LowerCall.
2396  unsigned InsIdx = 0;
2397 
2398  int idx = 0;
2399  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2400  Type *Ty = argTypes[i];
2401 
2402  // If the kernel argument is image*_t or sampler_t, convert it to
2403  // an i32 constant holding the parameter position. This can later
2404  // be matched in the AsmPrinter to output the correct mangled name.
2405  if (isImageOrSamplerVal(
2406  theArgs[i],
2407  (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2408  : nullptr))) {
2409  assert(isKernelFunction(*F) &&
2410  "Only kernels can have image/sampler params");
2411  InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2412  continue;
2413  }
2414 
2415  if (theArgs[i]->use_empty()) {
2416  // argument is dead
2417  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2418  SmallVector<EVT, 16> vtparts;
2419 
2420  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2421  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2422  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2423  ++parti) {
2424  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2425  ++InsIdx;
2426  }
2427  if (vtparts.size() > 0)
2428  --InsIdx;
2429  continue;
2430  }
2431  if (Ty->isVectorTy()) {
2432  EVT ObjectVT = getValueType(DL, Ty);
2433  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2434  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2435  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2436  ++InsIdx;
2437  }
2438  if (NumRegs > 0)
2439  --InsIdx;
2440  continue;
2441  }
2442  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2443  continue;
2444  }
2445 
2446  // In the following cases, assign a node order of "idx+1"
2447  // to newly created nodes. The SDNodes for params have to
2448  // appear in the same order as their order of appearance
2449  // in the original function. "idx+1" holds that order.
2450  if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2451  bool aggregateIsPacked = false;
2452  if (StructType *STy = dyn_cast<StructType>(Ty))
2453  aggregateIsPacked = STy->isPacked();
2454 
2455  SmallVector<EVT, 16> VTs;
2456  SmallVector<uint64_t, 16> Offsets;
2457  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2458  assert(VTs.size() > 0 && "Unexpected empty type.");
2459  auto VectorInfo =
2460  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2461 
2462  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2463  int VecIdx = -1; // Index of the first element of the current vector.
2464  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2465  if (VectorInfo[parti] & PVF_FIRST) {
2466  assert(VecIdx == -1 && "Orphaned vector.");
2467  VecIdx = parti;
2468  }
2469 
2470  // That's the last element of this store op.
2471  if (VectorInfo[parti] & PVF_LAST) {
2472  unsigned NumElts = parti - VecIdx + 1;
2473  EVT EltVT = VTs[parti];
2474  // i1 is loaded/stored as i8.
2475  EVT LoadVT = EltVT;
2476  if (EltVT == MVT::i1)
2477  LoadVT = MVT::i8;
2478  else if (EltVT == MVT::v2f16)
2479  // getLoad needs a vector type, but it can't handle
2480  // vectors which contain v2f16 elements. So we must load
2481  // using i32 here and then bitcast back.
2482  LoadVT = MVT::i32;
2483 
2484  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2485  SDValue VecAddr =
2486  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2487  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2488  Value *srcValue = Constant::getNullValue(PointerType::get(
2489  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2490  SDValue P =
2491  DAG.getLoad(VecVT, dl, Root, VecAddr,
2492  MachinePointerInfo(srcValue), aggregateIsPacked,
2493  MachineMemOperand::MODereferenceable |
2494  MachineMemOperand::MOInvariant);
2495  if (P.getNode())
2496  P.getNode()->setIROrder(idx + 1);
2497  for (unsigned j = 0; j < NumElts; ++j) {
2498  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2499  DAG.getIntPtrConstant(j, dl));
2500  // We've loaded i1 as an i8 and now must truncate it back to i1
2501  if (EltVT == MVT::i1)
2502  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2503  // v2f16 was loaded as an i32. Now we must bitcast it back.
2504  else if (EltVT == MVT::v2f16)
2505  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2506  // Extend the element if necessary (e.g. an i8 is loaded
2507  // into an i16 register)
2508  if (Ins[InsIdx].VT.isInteger() &&
2509  Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2510  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2511  : ISD::ZERO_EXTEND;
2512  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2513  }
2514  InVals.push_back(Elt);
2515  }
2516 
2517  // Reset vector tracking state.
2518  VecIdx = -1;
2519  }
2520  ++InsIdx;
2521  }
2522  if (VTs.size() > 0)
2523  --InsIdx;
2524  continue;
2525  }
2526 
2527  // Param has ByVal attribute
2528  // Return MoveParam(param symbol).
2529  // Ideally, the param symbol could be returned directly,
2530  // but when the SDNode builder decides to use it in a CopyToReg(),
2531  // the machine instruction fails because a TargetExternalSymbol
2532  // (not lowered) is target dependent, and CopyToReg assumes
2533  // the source is lowered.
2534  EVT ObjectVT = getValueType(DL, Ty);
2535  assert(ObjectVT == Ins[InsIdx].VT &&
2536  "Ins type did not match function type");
2537  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2538  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2539  if (p.getNode())
2540  p.getNode()->setIROrder(idx + 1);
2541  InVals.push_back(p);
2542  }
2543 
2544  // Clang will check for explicit varargs and issue an error if any are present.
2545  // However, Clang will let code with an implicit vararg declaration like f()
2546  // pass. See bug 617733.
2547  // We treat this case as if the arg list is empty.
2548  // if (F.isVarArg()) {
2549  // assert(0 && "VarArg not supported yet!");
2550  //}
2551 
2552  if (!OutChains.empty())
2553  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2554 
2555  return Chain;
2556 }
2557 
2558 SDValue
2559  NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2560  bool isVarArg,
2561  const SmallVectorImpl<ISD::OutputArg> &Outs,
2562  const SmallVectorImpl<SDValue> &OutVals,
2563  const SDLoc &dl, SelectionDAG &DAG) const {
2564  MachineFunction &MF = DAG.getMachineFunction();
2565  Type *RetTy = MF.getFunction().getReturnType();
2566 
2567  bool isABI = (STI.getSmVersion() >= 20);
2568  assert(isABI && "Non-ABI compilation is not supported");
2569  if (!isABI)
2570  return Chain;
2571 
2572  const DataLayout DL = DAG.getDataLayout();
2573  SmallVector<EVT, 16> VTs;
2574  SmallVector<uint64_t, 16> Offsets;
2575  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2576  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2577 
2578  auto VectorInfo = VectorizePTXValueVTs(
2579  VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2580 
2581  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2582  // 32-bits are sign extended or zero extended, depending on whether
2583  // they are signed or unsigned types.
2584  bool ExtendIntegerRetVal =
2585  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2586 
2587  SmallVector<SDValue, 6> StoreOperands;
2588  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2589  // New load/store. Record chain and offset operands.
2590  if (VectorInfo[i] & PVF_FIRST) {
2591  assert(StoreOperands.empty() && "Orphaned operand list.");
2592  StoreOperands.push_back(Chain);
2593  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2594  }
2595 
2596  SDValue RetVal = OutVals[i];
2597  if (ExtendIntegerRetVal) {
2598  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2599  : ISD::ZERO_EXTEND,
2600  dl, MVT::i32, RetVal);
2601  } else if (RetVal.getValueSizeInBits() < 16) {
2602  // Use 16-bit registers for small load-stores as it's the
2603  // smallest general purpose register size supported by NVPTX.
2604  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2605  }
2606 
2607  // Record the value to return.
2608  StoreOperands.push_back(RetVal);
2609 
2610  // That's the last element of this store op.
2611  if (VectorInfo[i] & PVF_LAST) {
2612  NVPTXISD::NodeType Op;
2613  unsigned NumElts = StoreOperands.size() - 2;
2614  switch (NumElts) {
2615  case 1:
2616  Op = NVPTXISD::StoreRetval;
2617  break;
2618  case 2:
2619  Op = NVPTXISD::StoreRetvalV2;
2620  break;
2621  case 4:
2622  Op = NVPTXISD::StoreRetvalV4;
2623  break;
2624  default:
2625  llvm_unreachable("Invalid vector info.");
2626  }
2627 
2628  // Adjust type of load/store op if we've extended the scalar
2629  // return value.
2630  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2631  Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2632  StoreOperands, TheStoreType,
2633  MachinePointerInfo(), /* Align */ 1,
2634  MachineMemOperand::MOStore);
2635  // Cleanup vector state.
2636  StoreOperands.clear();
2637  }
2638  }
2639 
2640  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2641 }
2642 
2643  void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2644  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2645  SelectionDAG &DAG) const {
2646  if (Constraint.length() > 1)
2647  return;
2648  else
2649  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2650 }
2651 
2652 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2653  switch (Intrinsic) {
2654  default:
2655  return 0;
2656 
2657  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2658  return NVPTXISD::Tex1DFloatS32;
2659  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2661  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2663  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2665  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2666  return NVPTXISD::Tex1DS32S32;
2667  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2668  return NVPTXISD::Tex1DS32Float;
2669  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2671  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2673  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2674  return NVPTXISD::Tex1DU32S32;
2675  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2676  return NVPTXISD::Tex1DU32Float;
2677  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2679  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2681 
2682  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2684  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2686  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2688  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2690  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2692  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2694  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2696  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2698  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2700  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2702  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2704  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2706 
2707  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2708  return NVPTXISD::Tex2DFloatS32;
2709  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2711  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2713  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2715  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2716  return NVPTXISD::Tex2DS32S32;
2717  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2718  return NVPTXISD::Tex2DS32Float;
2719  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2721  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2723  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2724  return NVPTXISD::Tex2DU32S32;
2725  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2726  return NVPTXISD::Tex2DU32Float;
2727  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2729  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2731 
2732  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2734  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2736  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2738  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2740  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2742  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2744  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2746  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2748  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2750  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2752  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2754  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2756 
2757  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2758  return NVPTXISD::Tex3DFloatS32;
2759  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2761  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2763  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2765  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2766  return NVPTXISD::Tex3DS32S32;
2767  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2768  return NVPTXISD::Tex3DS32Float;
2769  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2771  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2773  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2774  return NVPTXISD::Tex3DU32S32;
2775  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2776  return NVPTXISD::Tex3DU32Float;
2777  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2779  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2781 
2782  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2784  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2786  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2788  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2790  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2792  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2794 
2795  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2797  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2799  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2801  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2803  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2805  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2807 
2808  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2810  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2812  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2814  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2816  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2818  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2820  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2822  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2824  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2826  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2828  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2830  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2832 
2833  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2835  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2837  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2839  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2841  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2843  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2845  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2847  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2849  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2851  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2853  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2855  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2857 
2858  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2860  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2862  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2864  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2866  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2868  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2870  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2872  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2874  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2876  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2878  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2880  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2882 
2883  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2885  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2887  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2889  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2891  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2893  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2895  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2897  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2899  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2901  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2903  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2905  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2907 
2908  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2910  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2912  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2914  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2916  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2918  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2920  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2922  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2924  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2926  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2928  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2930  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2932 
2933  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2935  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2937  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2939  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2941  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
2943  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
2945  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
2947  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
2949  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
2951  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
2953  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
2955  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
2957 
2958  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
2960  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
2962  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
2964  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
2966  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
2968  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
2970 
2971  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
2973  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
2975  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
2977  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
2979  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
2981  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
2983 
2984  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
2986  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
2988  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
2990  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
2992  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
2994  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
2996  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
2998  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3000  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3002  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3004  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3006  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3008  }
3009 }
3010 
3011 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3012  switch (Intrinsic) {
3013  default:
3014  return 0;
3015  case Intrinsic::nvvm_suld_1d_i8_clamp:
3016  return NVPTXISD::Suld1DI8Clamp;
3017  case Intrinsic::nvvm_suld_1d_i16_clamp:
3018  return NVPTXISD::Suld1DI16Clamp;
3019  case Intrinsic::nvvm_suld_1d_i32_clamp:
3020  return NVPTXISD::Suld1DI32Clamp;
3021  case Intrinsic::nvvm_suld_1d_i64_clamp:
3022  return NVPTXISD::Suld1DI64Clamp;
3023  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3025  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3027  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3029  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3031  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3033  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3035  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3037  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3039  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3041  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3043  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3045  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3047  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3049  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3051  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3053  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3055  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3057  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3059  case Intrinsic::nvvm_suld_2d_i8_clamp:
3060  return NVPTXISD::Suld2DI8Clamp;
3061  case Intrinsic::nvvm_suld_2d_i16_clamp:
3062  return NVPTXISD::Suld2DI16Clamp;
3063  case Intrinsic::nvvm_suld_2d_i32_clamp:
3064  return NVPTXISD::Suld2DI32Clamp;
3065  case Intrinsic::nvvm_suld_2d_i64_clamp:
3066  return NVPTXISD::Suld2DI64Clamp;
3067  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3069  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3071  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3073  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3075  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3077  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3079  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3081  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3083  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3085  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3087  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3089  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3091  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3093  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3095  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3097  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3099  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3101  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3103  case Intrinsic::nvvm_suld_3d_i8_clamp:
3104  return NVPTXISD::Suld3DI8Clamp;
3105  case Intrinsic::nvvm_suld_3d_i16_clamp:
3106  return NVPTXISD::Suld3DI16Clamp;
3107  case Intrinsic::nvvm_suld_3d_i32_clamp:
3108  return NVPTXISD::Suld3DI32Clamp;
3109  case Intrinsic::nvvm_suld_3d_i64_clamp:
3110  return NVPTXISD::Suld3DI64Clamp;
3111  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3113  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3115  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3117  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3119  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3121  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3123  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3125  case Intrinsic::nvvm_suld_1d_i8_trap:
3126  return NVPTXISD::Suld1DI8Trap;
3127  case Intrinsic::nvvm_suld_1d_i16_trap:
3128  return NVPTXISD::Suld1DI16Trap;
3129  case Intrinsic::nvvm_suld_1d_i32_trap:
3130  return NVPTXISD::Suld1DI32Trap;
3131  case Intrinsic::nvvm_suld_1d_i64_trap:
3132  return NVPTXISD::Suld1DI64Trap;
3133  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3134  return NVPTXISD::Suld1DV2I8Trap;
3135  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3137  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3139  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3141  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3142  return NVPTXISD::Suld1DV4I8Trap;
3143  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3145  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3147  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3149  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3151  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3153  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3155  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3157  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3159  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3161  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3163  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3165  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3167  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3169  case Intrinsic::nvvm_suld_2d_i8_trap:
3170  return NVPTXISD::Suld2DI8Trap;
3171  case Intrinsic::nvvm_suld_2d_i16_trap:
3172  return NVPTXISD::Suld2DI16Trap;
3173  case Intrinsic::nvvm_suld_2d_i32_trap:
3174  return NVPTXISD::Suld2DI32Trap;
3175  case Intrinsic::nvvm_suld_2d_i64_trap:
3176  return NVPTXISD::Suld2DI64Trap;
3177  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3178  return NVPTXISD::Suld2DV2I8Trap;
3179  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3181  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3183  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3185  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3186  return NVPTXISD::Suld2DV4I8Trap;
3187  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3189  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3191  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3193  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3195  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3197  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3199  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3201  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3203  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3205  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3207  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3209  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3211  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3213  case Intrinsic::nvvm_suld_3d_i8_trap:
3214  return NVPTXISD::Suld3DI8Trap;
3215  case Intrinsic::nvvm_suld_3d_i16_trap:
3216  return NVPTXISD::Suld3DI16Trap;
3217  case Intrinsic::nvvm_suld_3d_i32_trap:
3218  return NVPTXISD::Suld3DI32Trap;
3219  case Intrinsic::nvvm_suld_3d_i64_trap:
3220  return NVPTXISD::Suld3DI64Trap;
3221  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3222  return NVPTXISD::Suld3DV2I8Trap;
3223  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3225  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3227  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3229  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3230  return NVPTXISD::Suld3DV4I8Trap;
3231  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3233  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3235  case Intrinsic::nvvm_suld_1d_i8_zero:
3236  return NVPTXISD::Suld1DI8Zero;
3237  case Intrinsic::nvvm_suld_1d_i16_zero:
3238  return NVPTXISD::Suld1DI16Zero;
3239  case Intrinsic::nvvm_suld_1d_i32_zero:
3240  return NVPTXISD::Suld1DI32Zero;
3241  case Intrinsic::nvvm_suld_1d_i64_zero:
3242  return NVPTXISD::Suld1DI64Zero;
3243  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3244  return NVPTXISD::Suld1DV2I8Zero;
3245  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3247  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3249  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3251  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3252  return NVPTXISD::Suld1DV4I8Zero;
3253  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3255  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3257  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3259  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3261  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3263  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3265  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3267  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3269  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3271  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3273  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3275  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3277  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3279  case Intrinsic::nvvm_suld_2d_i8_zero:
3280  return NVPTXISD::Suld2DI8Zero;
3281  case Intrinsic::nvvm_suld_2d_i16_zero:
3282  return NVPTXISD::Suld2DI16Zero;
3283  case Intrinsic::nvvm_suld_2d_i32_zero:
3284  return NVPTXISD::Suld2DI32Zero;
3285  case Intrinsic::nvvm_suld_2d_i64_zero:
3286  return NVPTXISD::Suld2DI64Zero;
3287  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3288  return NVPTXISD::Suld2DV2I8Zero;
3289  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3291  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3293  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3295  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3296  return NVPTXISD::Suld2DV4I8Zero;
3297  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3299  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3301  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3303  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3305  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3307  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3309  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3311  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3313  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3315  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3317  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3319  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3321  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3323  case Intrinsic::nvvm_suld_3d_i8_zero:
3324  return NVPTXISD::Suld3DI8Zero;
3325  case Intrinsic::nvvm_suld_3d_i16_zero:
3326  return NVPTXISD::Suld3DI16Zero;
3327  case Intrinsic::nvvm_suld_3d_i32_zero:
3328  return NVPTXISD::Suld3DI32Zero;
3329  case Intrinsic::nvvm_suld_3d_i64_zero:
3330  return NVPTXISD::Suld3DI64Zero;
3331  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3332  return NVPTXISD::Suld3DV2I8Zero;
3333  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3335  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3337  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3339  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3340  return NVPTXISD::Suld3DV4I8Zero;
3341  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3343  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3345  }
3346 }
3347 
3348 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3349 // TgtMemIntrinsic
3350 // because we need the information that is only available in the "Value" type
3351 // of destination
3352 // pointer. In particular, the address space information.
3353  bool NVPTXTargetLowering::getTgtMemIntrinsic(
3354  IntrinsicInfo &Info, const CallInst &I,
3355  MachineFunction &MF, unsigned Intrinsic) const {
3356  switch (Intrinsic) {
3357  default:
3358  return false;
3359  case Intrinsic::nvvm_match_all_sync_i32p:
3360  case Intrinsic::nvvm_match_all_sync_i64p:
3361  Info.opc = ISD::INTRINSIC_W_CHAIN;
3362  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3363  // in order to model data exchange with other threads, but perform no real
3364  // memory accesses.
3365  Info.memVT = MVT::i1;
3366 
3367  // Our result depends on both our and other thread's arguments.
3368  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3369  return true;
3370  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3371  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3372  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3373  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3374  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3375  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3376  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3377  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3378  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3379  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3380  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3381  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3382  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3383  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3384  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3385  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3386  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3387  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3388  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3389  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3390  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3391  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3392  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3393  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3394  Info.opc = ISD::INTRINSIC_W_CHAIN;
3395  Info.memVT = MVT::v8f16;
3396  Info.ptrVal = I.getArgOperand(0);
3397  Info.offset = 0;
3398  Info.flags = MachineMemOperand::MOLoad;
3399  Info.align = 16;
3400  return true;
3401  }
3402 
3403  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3404  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3405  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3406  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3407  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3408  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3409  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3410  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3411  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3412  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3413  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3414  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3415  Info.opc = ISD::INTRINSIC_W_CHAIN;
3416  Info.memVT = MVT::v4f16;
3417  Info.ptrVal = I.getArgOperand(0);
3418  Info.offset = 0;
3419  Info.flags = MachineMemOperand::MOLoad;
3420  Info.align = 16;
3421  return true;
3422  }
3423 
3424  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3425  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3426  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3427  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3428  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3429  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3430  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3431  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3432  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3433  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3434  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3435  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3436  Info.opc = ISD::INTRINSIC_W_CHAIN;
3437  Info.memVT = MVT::v8f32;
3438  Info.ptrVal = I.getArgOperand(0);
3439  Info.offset = 0;
3440  Info.flags = MachineMemOperand::MOLoad;
3441  Info.align = 16;
3442  return true;
3443  }
3444 
3445  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3446  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3447  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3448  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3449  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3450  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3451  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3452  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3453  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3454  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3455  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3456  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3457  Info.opc = ISD::INTRINSIC_VOID;
3458  Info.memVT = MVT::v4f16;
3459  Info.ptrVal = I.getArgOperand(0);
3460  Info.offset = 0;
3461  Info.flags = MachineMemOperand::MOStore;
3462  Info.align = 16;
3463  return true;
3464  }
3465 
3466  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3467  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3468  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3469  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3470  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3471  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3472  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3473  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3474  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3475  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3476  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3477  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3478  Info.opc = ISD::INTRINSIC_VOID;
3479  Info.memVT = MVT::v8f32;
3480  Info.ptrVal = I.getArgOperand(0);
3481  Info.offset = 0;
3482  Info.flags = MachineMemOperand::MOStore;
3483  Info.align = 16;
3484  return true;
3485  }
3486 
3487  case Intrinsic::nvvm_atomic_load_add_f32:
3488  case Intrinsic::nvvm_atomic_load_add_f64:
3489  case Intrinsic::nvvm_atomic_load_inc_32:
3490  case Intrinsic::nvvm_atomic_load_dec_32:
3491 
3492  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3493  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3494  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3495  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3496  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3497  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3498  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3499  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3500  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3501  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3502  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3503  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3504  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3505  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3506  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3507  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3508  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3509  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3510  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3511  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3512  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3513  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3514  auto &DL = I.getModule()->getDataLayout();
3515  Info.opc = ISD::INTRINSIC_W_CHAIN;
3516  Info.memVT = getValueType(DL, I.getType());
3517  Info.ptrVal = I.getArgOperand(0);
3518  Info.offset = 0;
3519  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3520  Info.align = 0;
3521  return true;
3522  }
3523 
3524  case Intrinsic::nvvm_ldu_global_i:
3525  case Intrinsic::nvvm_ldu_global_f:
3526  case Intrinsic::nvvm_ldu_global_p: {
3527  auto &DL = I.getModule()->getDataLayout();
3528  Info.opc = ISD::INTRINSIC_W_CHAIN;
3529  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3530  Info.memVT = getValueType(DL, I.getType());
3531  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3532  Info.memVT = getPointerTy(DL);
3533  else
3534  Info.memVT = getValueType(DL, I.getType());
3535  Info.ptrVal = I.getArgOperand(0);
3536  Info.offset = 0;
3537  Info.flags = MachineMemOperand::MOLoad;
3538  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3539 
3540  return true;
3541  }
3542  case Intrinsic::nvvm_ldg_global_i:
3543  case Intrinsic::nvvm_ldg_global_f:
3544  case Intrinsic::nvvm_ldg_global_p: {
3545  auto &DL = I.getModule()->getDataLayout();
3546 
3547  Info.opc = ISD::INTRINSIC_W_CHAIN;
3548  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3549  Info.memVT = getValueType(DL, I.getType());
3550  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3551  Info.memVT = getPointerTy(DL);
3552  else
3553  Info.memVT = getValueType(DL, I.getType());
3554  Info.ptrVal = I.getArgOperand(0);
3555  Info.offset = 0;
3556  Info.flags = MachineMemOperand::MOLoad;
3557  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3558 
3559  return true;
3560  }
3561 
3562  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3563  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3564  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3565  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3566  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3567  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3568  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3569  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3570  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3571  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3572  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3573  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3574  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3575  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3576  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3577  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3578  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3579  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3580  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3581  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3582  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3583  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3584  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3585  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3586  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3587  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3588  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3589  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3590  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3591  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3592  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3593  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3594  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3595  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3596  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3597  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3598  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3599  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3600  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3601  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3602  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3603  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3604  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3605  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3606  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3607  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3608  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3609  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3610  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3611  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3612  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3613  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3614  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3615  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3616  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3617  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3618  Info.opc = getOpcForTextureInstr(Intrinsic);
3619  Info.memVT = MVT::v4f32;
3620  Info.ptrVal = nullptr;
3621  Info.offset = 0;
3622  Info.flags = MachineMemOperand::MOLoad;
3623  Info.align = 16;
3624  return true;
3625 
3626  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3627  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3628  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3629  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3630  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3631  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3632  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3633  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3634  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3635  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3636  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3637  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3638  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3639  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3640  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3641  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3642  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3643  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3644  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3645  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3646  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3647  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3648  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3649  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3650  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3651  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3652  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3653  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3654  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3655  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3656  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3657  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3658  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3659  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3660  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3661  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3662  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3663  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3664  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3665  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3666  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3667  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3668  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3669  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3670  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3671  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3672  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3673  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3674  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3675  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3676  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3677  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3678  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3679  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3680  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3681  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3682  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3683  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3684  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3685  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3686  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3687  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3688  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3689  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3690  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3691  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3692  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3693  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3694  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3695  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3696  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3697  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3698  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3699  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3700  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3701  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3702  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3703  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3704  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3705  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3706  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3707  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3708  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3709  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3710  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3711  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3712  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3713  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3714  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3715  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3716  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3717  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3718  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3719  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3720  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3721  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3722  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3723  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3724  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3725  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3726  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3727  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3728  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3729  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3730  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3731  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3732  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3733  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3734  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3735  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3736  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3737  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3738  Info.opc = getOpcForTextureInstr(Intrinsic);
3739  Info.memVT = MVT::v4i32;
3740  Info.ptrVal = nullptr;
3741  Info.offset = 0;
3742  Info.flags = MachineMemOperand::MOLoad;
3743  Info.align = 16;
3744  return true;
3745 
3746  case Intrinsic::nvvm_suld_1d_i8_clamp:
3747  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3748  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3749  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3750  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3751  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3752  case Intrinsic::nvvm_suld_2d_i8_clamp:
3753  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3754  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3755  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3756  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3757  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3758  case Intrinsic::nvvm_suld_3d_i8_clamp:
3759  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3760  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3761  case Intrinsic::nvvm_suld_1d_i8_trap:
3762  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3763  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3764  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3765  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3766  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3767  case Intrinsic::nvvm_suld_2d_i8_trap:
3768  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3769  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3770  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3771  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3772  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3773  case Intrinsic::nvvm_suld_3d_i8_trap:
3774  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3775  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3776  case Intrinsic::nvvm_suld_1d_i8_zero:
3777  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3778  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3779  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3780  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3781  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3782  case Intrinsic::nvvm_suld_2d_i8_zero:
3783  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3784  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3785  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3786  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3787  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3788  case Intrinsic::nvvm_suld_3d_i8_zero:
3789  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3790  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3791  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3792  Info.memVT = MVT::i8;
3793  Info.ptrVal = nullptr;
3794  Info.offset = 0;
3795  Info.flags = MachineMemOperand::MOLoad;
3796  Info.align = 16;
3797  return true;
3798 
3799  case Intrinsic::nvvm_suld_1d_i16_clamp:
3800  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3801  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3802  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3803  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3804  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3805  case Intrinsic::nvvm_suld_2d_i16_clamp:
3806  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3807  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3808  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3809  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3810  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3811  case Intrinsic::nvvm_suld_3d_i16_clamp:
3812  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3813  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3814  case Intrinsic::nvvm_suld_1d_i16_trap:
3815  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3816  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3817  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3818  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3819  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3820  case Intrinsic::nvvm_suld_2d_i16_trap:
3821  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3822  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3823  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3824  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3825  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3826  case Intrinsic::nvvm_suld_3d_i16_trap:
3827  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3828  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3829  case Intrinsic::nvvm_suld_1d_i16_zero:
3830  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3831  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3832  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3833  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3834  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3835  case Intrinsic::nvvm_suld_2d_i16_zero:
3836  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3837  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3838  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3839  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3840  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3841  case Intrinsic::nvvm_suld_3d_i16_zero:
3842  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3843  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3844  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3845  Info.memVT = MVT::i16;
3846  Info.ptrVal = nullptr;
3847  Info.offset = 0;
3848  Info.flags = MachineMemOperand::MOLoad;
3849  Info.align = 16;
3850  return true;
3851 
3852  case Intrinsic::nvvm_suld_1d_i32_clamp:
3853  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3854  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3855  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3856  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3857  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3858  case Intrinsic::nvvm_suld_2d_i32_clamp:
3859  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3860  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3861  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3862  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3863  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3864  case Intrinsic::nvvm_suld_3d_i32_clamp:
3865  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3866  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3867  case Intrinsic::nvvm_suld_1d_i32_trap:
3868  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3869  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3870  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3871  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3872  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3873  case Intrinsic::nvvm_suld_2d_i32_trap:
3874  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3875  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3876  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3877  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3878  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3879  case Intrinsic::nvvm_suld_3d_i32_trap:
3880  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3881  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3882  case Intrinsic::nvvm_suld_1d_i32_zero:
3883  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3884  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3885  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3886  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3887  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3888  case Intrinsic::nvvm_suld_2d_i32_zero:
3889  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3890  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3891  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3892  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3893  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3894  case Intrinsic::nvvm_suld_3d_i32_zero:
3895  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3896  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3897  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3898  Info.memVT = MVT::i32;
3899  Info.ptrVal = nullptr;
3900  Info.offset = 0;
3901  Info.flags = MachineMemOperand::MOLoad;
3902  Info.align = 16;
3903  return true;
3904 
3905  case Intrinsic::nvvm_suld_1d_i64_clamp:
3906  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3907  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3908  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3909  case Intrinsic::nvvm_suld_2d_i64_clamp:
3910  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3911  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3912  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3913  case Intrinsic::nvvm_suld_3d_i64_clamp:
3914  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3915  case Intrinsic::nvvm_suld_1d_i64_trap:
3916  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3917  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3918  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3919  case Intrinsic::nvvm_suld_2d_i64_trap:
3920  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3921  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3922  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3923  case Intrinsic::nvvm_suld_3d_i64_trap:
3924  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3925  case Intrinsic::nvvm_suld_1d_i64_zero:
3926  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3927  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3928  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3929  case Intrinsic::nvvm_suld_2d_i64_zero:
3930  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3931  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3932  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3933  case Intrinsic::nvvm_suld_3d_i64_zero:
3934  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3935  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3936  Info.memVT = MVT::i64;
3937  Info.ptrVal = nullptr;
3938  Info.offset = 0;
3939  Info.flags = MachineMemOperand::MOLoad;
3940  Info.align = 16;
3941  return true;
3942  }
3943  return false;
3944 }
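// A sketch of how the information filled in above is consumed (illustrative
// only; the mangled intrinsic name below is an assumption about the usual
// overloading, not taken from this file): for IR such as
//   %v = call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %p, i32 4)
// the ldg case reports opc = INTRINSIC_W_CHAIN, memVT = i32, ptrVal = %p,
// flags = MOLoad and align = 4, which SelectionDAGBuilder uses to attach a
// MachineMemOperand to the resulting node so later passes can treat the
// intrinsic like an ordinary load.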
3945 
3946 /// isLegalAddressingMode - Return true if the addressing mode represented
3947 /// by AM is legal for this target, for a load/store of the specified type.
3948 /// Used to guide target specific optimizations, like loop strength reduction
3949 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
3950 /// (CodeGenPrepare.cpp)
3951 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
3952  const AddrMode &AM, Type *Ty,
3953  unsigned AS, Instruction *I) const {
3954  // AddrMode - This represents an addressing mode of:
3955  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
3956  //
3957  // The legal address modes are
3958  // - [avar]
3959  // - [areg]
3960  // - [areg+immoff]
3961  // - [immAddr]
3962 
3963  if (AM.BaseGV) {
3964  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
3965  }
3966 
3967  switch (AM.Scale) {
3968  case 0: // "r", "r+i" or "i" is allowed
3969  break;
3970  case 1:
3971  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
3972  return false;
3973  // Otherwise we have r+i.
3974  break;
3975  default:
3976  // No scale > 1 is allowed
3977  return false;
3978  }
3979  return true;
3980 }
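// Worked examples for the rules above (an illustrative sketch, not part of
// the original source), written as AM = {BaseGV, BaseOffs, HasBaseReg, Scale}:
//   [globalvar]       -> BaseGV only                 -> legal  ([avar])
//   [reg]             -> HasBaseReg, Scale == 0      -> legal  ([areg])
//   [reg + 16]        -> HasBaseReg + BaseOffs       -> legal  ([areg+immoff])
//   [globalvar + 16]  -> BaseGV with an offset       -> rejected
//   [reg + reg]       -> HasBaseReg, Scale == 1      -> rejected
//   [reg + 4*reg]     -> Scale > 1                   -> rejected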
3981 
3982 //===----------------------------------------------------------------------===//
3983 // NVPTX Inline Assembly Support
3984 //===----------------------------------------------------------------------===//
3985 
3986 /// getConstraintType - Given a constraint letter, return the type of
3987 /// constraint it is for this target.
3988 NVPTXTargetLowering::ConstraintType
3989 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
3990  if (Constraint.size() == 1) {
3991  switch (Constraint[0]) {
3992  default:
3993  break;
3994  case 'b':
3995  case 'r':
3996  case 'h':
3997  case 'c':
3998  case 'l':
3999  case 'f':
4000  case 'd':
4001  case '0':
4002  case 'N':
4003  return C_RegisterClass;
4004  }
4005  }
4006  return TargetLowering::getConstraintType(Constraint);
4007 }
4008 
4009 std::pair<unsigned, const TargetRegisterClass *>
4010 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4011  StringRef Constraint,
4012  MVT VT) const {
4013  if (Constraint.size() == 1) {
4014  switch (Constraint[0]) {
4015  case 'b':
4016  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4017  case 'c':
4018  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4019  case 'h':
4020  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4021  case 'r':
4022  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4023  case 'l':
4024  case 'N':
4025  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4026  case 'f':
4027  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4028  case 'd':
4029  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4030  }
4031  }
4032  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4033 }
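// Example of how these constraints are typically exercised from CUDA/C++
// (an illustrative sketch, not taken from this file):
//   int a = ..., b = ..., d;
//   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
// With the mapping above, every 'r' operand is allocated from
// NVPTX::Int32RegsRegClass; 'l', 'f' and 'd' would select the 64-bit
// integer, f32 and f64 register classes instead.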
4034 
4035 //===----------------------------------------------------------------------===//
4036 // NVPTX DAG Combining
4037 //===----------------------------------------------------------------------===//
4038 
4039 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4040  CodeGenOpt::Level OptLevel) const {
4041  // Always honor command-line argument
4042  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4043  return FMAContractLevelOpt > 0;
4044 
4045  // Do not contract if we're not optimizing the code.
4046  if (OptLevel == 0)
4047  return false;
4048 
4049  // Honor TargetOptions flags that explicitly say fusion is okay.
4050  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4051  return true;
4052 
4053  return allowUnsafeFPMath(MF);
4054 }
4055 
4056 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4057  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4058  if (MF.getTarget().Options.UnsafeFPMath)
4059  return true;
4060 
4061  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4062  const Function &F = MF.getFunction();
4063  if (F.hasFnAttribute("unsafe-fp-math")) {
4064  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4065  StringRef Val = Attr.getValueAsString();
4066  if (Val == "true")
4067  return true;
4068  }
4069 
4070  return false;
4071 }
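// Illustrative example (not from the original source): a function whose IR
// carries
//   attributes #0 = { "unsafe-fp-math"="true" }
// takes the attribute path above and gets the relaxed behaviour even when
// the global UnsafeFPMath target option was not set on the command line.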
4072 
4073 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4074 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4075 /// called with the default operands, and if that fails, with commuted
4076 /// operands.
4077 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4078  TargetLowering::DAGCombinerInfo &DCI,
4079  const NVPTXSubtarget &Subtarget,
4080  CodeGenOpt::Level OptLevel) {
4081  SelectionDAG &DAG = DCI.DAG;
4082  // Skip non-integer, non-scalar case
4083  EVT VT=N0.getValueType();
4084  if (VT.isVector())
4085  return SDValue();
4086 
4087  // fold (add (mul a, b), c) -> (mad a, b, c)
4088  //
4089  if (N0.getOpcode() == ISD::MUL) {
4090  assert (VT.isInteger());
4091  // For integer:
4092  // Since integer multiply-add costs the same as integer multiply
4093  // but is more costly than integer add, do the fusion only when
4094  // the mul is only used in the add.
4095  if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4096  !N0.getNode()->hasOneUse())
4097  return SDValue();
4098 
4099  // Do the folding
4100  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4101  N0.getOperand(0), N0.getOperand(1), N1);
4102  }
4103  else if (N0.getOpcode() == ISD::FMUL) {
4104  if (VT == MVT::f32 || VT == MVT::f64) {
4105  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4106  &DAG.getTargetLoweringInfo());
4107  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4108  return SDValue();
4109 
4110  // For floating point:
4111  // Do the fusion only when the mul has fewer than 5 uses, all of
4112  // which are adds.
4113  // The heuristic is that if a use is not an add, then that use
4114  // cannot be fused into an fma, so the mul is still needed anyway.
4115  // If there are more than 4 uses, even if they are all adds, fusing
4116  // them will increase register pressure.
4117  //
4118  int numUses = 0;
4119  int nonAddCount = 0;
4120  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4121  UE = N0.getNode()->use_end();
4122  UI != UE; ++UI) {
4123  numUses++;
4124  SDNode *User = *UI;
4125  if (User->getOpcode() != ISD::FADD)
4126  ++nonAddCount;
4127  }
4128  if (numUses >= 5)
4129  return SDValue();
4130  if (nonAddCount) {
4131  int orderNo = N->getIROrder();
4132  int orderNo2 = N0.getNode()->getIROrder();
4133  // Simple heuristic for estimating potential register pressure: the
4134  // difference in IR order is used to measure the distance between the
4135  // def and its use; the longer that distance, the more likely it is
4136  // to cause register pressure.
4137  if (orderNo - orderNo2 < 500)
4138  return SDValue();
4139 
4140  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4141  // which guarantees that the FMA will not increase register pressure at node N.
4142  bool opIsLive = false;
4143  const SDNode *left = N0.getOperand(0).getNode();
4144  const SDNode *right = N0.getOperand(1).getNode();
4145 
4146  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4147  opIsLive = true;
4148 
4149  if (!opIsLive)
4150  for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4151  SDNode *User = *UI;
4152  int orderNo3 = User->getIROrder();
4153  if (orderNo3 > orderNo) {
4154  opIsLive = true;
4155  break;
4156  }
4157  }
4158 
4159  if (!opIsLive)
4160  for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4161  SDNode *User = *UI;
4162  int orderNo3 = User->getIROrder();
4163  if (orderNo3 > orderNo) {
4164  opIsLive = true;
4165  break;
4166  }
4167  }
4168 
4169  if (!opIsLive)
4170  return SDValue();
4171  }
4172 
4173  return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4174  N0.getOperand(0), N0.getOperand(1), N1);
4175  }
4176  }
4177 
4178  return SDValue();
4179 }
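// Net effect of this combine (illustrative, assuming the usual PTX lowering
// of the resulting nodes):
//   (add  (mul  a, b), c), i32 operands     -> NVPTXISD::IMAD -> mad.lo.s32
//   (fadd (fmul a, b), c), f32/f64 operands -> ISD::FMA       -> fma.rn.f32/f64
// subject to the single-use check in the integer case and the use-count and
// IR-order heuristics above in the floating-point case.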
4180 
4181 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4182 ///
4183 static SDValue PerformADDCombine(SDNode *N,
4184  TargetLowering::DAGCombinerInfo &DCI,
4185  const NVPTXSubtarget &Subtarget,
4186  CodeGenOpt::Level OptLevel) {
4187  SDValue N0 = N->getOperand(0);
4188  SDValue N1 = N->getOperand(1);
4189 
4190  // First try with the default operand order.
4191  if (SDValue Result =
4192  PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4193  return Result;
4194 
4195  // If that didn't work, try again with the operands commuted.
4196  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4197 }
4198 
4199 static SDValue PerformANDCombine(SDNode *N,
4200  TargetLowering::DAGCombinerInfo &DCI) {
4201  // The type legalizer turns a vector load of i8 values into a zextload to i16
4202  // registers, optionally ANY_EXTENDs it (if target type is integer),
4203  // and ANDs off the high 8 bits. Since we turn this load into a
4204  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4205  // nodes. Do that here.
4206  SDValue Val = N->getOperand(0);
4207  SDValue Mask = N->getOperand(1);
4208 
4209  if (isa<ConstantSDNode>(Val)) {
4210  std::swap(Val, Mask);
4211  }
4212 
4213  SDValue AExt;
4214  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4215  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4216  AExt = Val;
4217  Val = Val->getOperand(0);
4218  }
4219 
4220  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4221  Val = Val->getOperand(0);
4222  }
4223 
4224  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4225  Val->getOpcode() == NVPTXISD::LoadV4) {
4226  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4227  if (!MaskCnst) {
4228  // Not an AND with a constant
4229  return SDValue();
4230  }
4231 
4232  uint64_t MaskVal = MaskCnst->getZExtValue();
4233  if (MaskVal != 0xff) {
4234  // Not an AND that chops off top 8 bits
4235  return SDValue();
4236  }
4237 
4238  MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4239  if (!Mem) {
4240  // Not a MemSDNode?!?
4241  return SDValue();
4242  }
4243 
4244  EVT MemVT = Mem->getMemoryVT();
4245  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4246  // We only handle the i8 case
4247  return SDValue();
4248  }
4249 
4250  unsigned ExtType =
4251  cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4252  getZExtValue();
4253  if (ExtType == ISD::SEXTLOAD) {
4254  // If for some reason the load is a sextload, the and is needed to zero
4255  // out the high 8 bits
4256  return SDValue();
4257  }
4258 
4259  bool AddTo = false;
4260  if (AExt.getNode() != nullptr) {
4261  // Re-insert the ext as a zext.
4262  Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4263  AExt.getValueType(), Val);
4264  AddTo = true;
4265  }
4266 
4267  // If we get here, the AND is unnecessary. Just replace it with the load
4268  DCI.CombineTo(N, Val, AddTo);
4269  }
4270 
4271  return SDValue();
4272 }
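// Shape of the DAG this combine cleans up (a sketch, not from the original
// source): after type legalization a small-element vector load can appear as
//   (and (any_extend (NVPTXISD::LoadV2 ..., zextload v2i8)), 0xff)
// Because the loaded lanes are already zero-extended, the AND is dropped and
// the any_extend, when present, is re-emitted as a zero_extend of the load.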
4273 
4274 static SDValue PerformREMCombine(SDNode *N,
4275  TargetLowering::DAGCombinerInfo &DCI,
4276  CodeGenOpt::Level OptLevel) {
4277  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4278 
4279  // Don't do anything at less than -O2.
4280  if (OptLevel < CodeGenOpt::Default)
4281  return SDValue();
4282 
4283  SelectionDAG &DAG = DCI.DAG;
4284  SDLoc DL(N);
4285  EVT VT = N->getValueType(0);
4286  bool IsSigned = N->getOpcode() == ISD::SREM;
4287  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4288 
4289  const SDValue &Num = N->getOperand(0);
4290  const SDValue &Den = N->getOperand(1);
4291 
4292  for (const SDNode *U : Num->uses()) {
4293  if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4294  U->getOperand(1) == Den) {
4295  // Num % Den -> Num - (Num / Den) * Den
4296  return DAG.getNode(ISD::SUB, DL, VT, Num,
4297  DAG.getNode(ISD::MUL, DL, VT,
4298  DAG.getNode(DivOpc, DL, VT, Num, Den),
4299  Den));
4300  }
4301  }
4302  return SDValue();
4303 }
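// IR-level view of the rewrite (illustrative): when both results exist,
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d
// the urem is re-expressed as sub(%n, mul(%q, %d)); the freshly built divide
// CSEs with the existing one, so only a single division sequence remains.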
4304 
4305 enum OperandSignedness {
4306  Signed = 0,
4307  Unsigned,
4308  Unknown
4309 };
4310 
4311 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4312 /// that can be demoted to \p OptSize bits without loss of information. The
4313 /// signedness of the operand, if determinable, is placed in \p S.
4314 static bool IsMulWideOperandDemotable(SDValue Op,
4315  unsigned OptSize,
4316  OperandSignedness &S) {
4317  S = Unknown;
4318 
4319  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4320  Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4321  EVT OrigVT = Op.getOperand(0).getValueType();
4322  if (OrigVT.getSizeInBits() <= OptSize) {
4323  S = Signed;
4324  return true;
4325  }
4326  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4327  EVT OrigVT = Op.getOperand(0).getValueType();
4328  if (OrigVT.getSizeInBits() <= OptSize) {
4329  S = Unsigned;
4330  return true;
4331  }
4332  }
4333 
4334  return false;
4335 }
4336 
4337 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4338 /// be demoted to \p OptSize bits without loss of information. If the operands
4339 /// contain a constant, it should appear as the RHS operand. The signedness of
4340 /// the operands is placed in \p IsSigned.
4341 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4342  unsigned OptSize,
4343  bool &IsSigned) {
4344  OperandSignedness LHSSign;
4345 
4346  // The LHS operand must be a demotable op
4347  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4348  return false;
4349 
4350  // We should have been able to determine the signedness from the LHS
4351  if (LHSSign == Unknown)
4352  return false;
4353 
4354  IsSigned = (LHSSign == Signed);
4355 
4356  // The RHS can be a demotable op or a constant
4357  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4358  const APInt &Val = CI->getAPIntValue();
4359  if (LHSSign == Unsigned) {
4360  return Val.isIntN(OptSize);
4361  } else {
4362  return Val.isSignedIntN(OptSize);
4363  }
4364  } else {
4365  OperandSignedness RHSSign;
4366  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4367  return false;
4368 
4369  return LHSSign == RHSSign;
4370  }
4371 }
4372 
4373 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4374 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4375 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4376 /// amount.
4377 static SDValue TryMULWIDECombine(SDNode *N,
4378  TargetLowering::DAGCombinerInfo &DCI) {
4379  EVT MulType = N->getValueType(0);
4380  if (MulType != MVT::i32 && MulType != MVT::i64) {
4381  return SDValue();
4382  }
4383 
4384  SDLoc DL(N);
4385  unsigned OptSize = MulType.getSizeInBits() >> 1;
4386  SDValue LHS = N->getOperand(0);
4387  SDValue RHS = N->getOperand(1);
4388 
4389  // Canonicalize the multiply so the constant (if any) is on the right
4390  if (N->getOpcode() == ISD::MUL) {
4391  if (isa<ConstantSDNode>(LHS)) {
4392  std::swap(LHS, RHS);
4393  }
4394  }
4395 
4396  // If we have a SHL, determine the actual multiply amount
4397  if (N->getOpcode() == ISD::SHL) {
4398  ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4399  if (!ShlRHS) {
4400  return SDValue();
4401  }
4402 
4403  APInt ShiftAmt = ShlRHS->getAPIntValue();
4404  unsigned BitWidth = MulType.getSizeInBits();
4405  if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4406  APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4407  RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4408  } else {
4409  return SDValue();
4410  }
4411  }
4412 
4413  bool Signed;
4414  // Verify that our operands are demotable
4415  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4416  return SDValue();
4417  }
4418 
4419  EVT DemotedVT;
4420  if (MulType == MVT::i32) {
4421  DemotedVT = MVT::i16;
4422  } else {
4423  DemotedVT = MVT::i32;
4424  }
4425 
4426  // Truncate the operands to the correct size. Note that these are just for
4427  // type consistency and will (likely) be eliminated in later phases.
4428  SDValue TruncLHS =
4429  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4430  SDValue TruncRHS =
4431  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4432 
4433  unsigned Opc;
4434  if (Signed) {
4435  Opc = NVPTXISD::MUL_WIDE_SIGNED;
4436  } else {
4437  Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4438  }
4439 
4440  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4441 }
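// Examples of inputs this combine accepts (a sketch; the PTX mnemonics shown
// are the usual lowering of the MUL_WIDE nodes, not asserted by this file):
//   mul i32 (sext i16 %a to i32), (sext i16 %b to i32)  -> mul.wide.s16
//   mul i64 (zext i32 %a to i64), 12345                 -> mul.wide.u32
//   shl i32 (sext i16 %a to i32), 3                     -> mul.wide.s16 %a, 8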
4442 
4443 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4444 static SDValue PerformMULCombine(SDNode *N,
4445  TargetLowering::DAGCombinerInfo &DCI,
4446  CodeGenOpt::Level OptLevel) {
4447  if (OptLevel > 0) {
4448  // Try mul.wide combining at OptLevel > 0
4449  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4450  return Ret;
4451  }
4452 
4453  return SDValue();
4454 }
4455 
4456 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4457 static SDValue PerformSHLCombine(SDNode *N,
4458  TargetLowering::DAGCombinerInfo &DCI,
4459  CodeGenOpt::Level OptLevel) {
4460  if (OptLevel > 0) {
4461  // Try mul.wide combining at OptLevel > 0
4462  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4463  return Ret;
4464  }
4465 
4466  return SDValue();
4467 }
4468 
4469 static SDValue PerformSETCCCombine(SDNode *N,
4470  TargetLowering::DAGCombinerInfo &DCI) {
4471  EVT CCType = N->getValueType(0);
4472  SDValue A = N->getOperand(0);
4473  SDValue B = N->getOperand(1);
4474 
4475  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4476  return SDValue();
4477 
4478  SDLoc DL(N);
4479  // setp.f16x2 returns two scalar predicates, which we need to
4480  // convert back to v2i1. The returned result will be scalarized by
4481  // the legalizer, but the comparison will remain a single vector
4482  // instruction.
4483  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4484  DCI.DAG.getVTList(MVT::i1, MVT::i1),
4485  {A, B, N->getOperand(2)});
4486  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4487  CCNode.getValue(1));
4488 }
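// In effect (an illustrative sketch): a v2f16 comparison such as
//   (setcc v2i1 %a, %b, seteq) with %a, %b : v2f16
// becomes one NVPTXISD::SETP_F16X2 node yielding two i1 results, which are
// packed back into v2i1 with BUILD_VECTOR, so a single f16x2 setp can be
// emitted instead of two scalar compares.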
4489 
4490 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4491  DAGCombinerInfo &DCI) const {
4492  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4493  switch (N->getOpcode()) {
4494  default: break;
4495  case ISD::ADD:
4496  case ISD::FADD:
4497  return PerformADDCombine(N, DCI, STI, OptLevel);
4498  case ISD::MUL:
4499  return PerformMULCombine(N, DCI, OptLevel);
4500  case ISD::SHL:
4501  return PerformSHLCombine(N, DCI, OptLevel);
4502  case ISD::AND:
4503  return PerformANDCombine(N, DCI);
4504  case ISD::UREM:
4505  case ISD::SREM:
4506  return PerformREMCombine(N, DCI, OptLevel);
4507  case ISD::SETCC:
4508  return PerformSETCCCombine(N, DCI);
4509  }
4510  return SDValue();
4511 }
4512 
4513 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
4514 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4515  SmallVectorImpl<SDValue> &Results) {
4516  EVT ResVT = N->getValueType(0);
4517  SDLoc DL(N);
4518 
4519  assert(ResVT.isVector() && "Vector load must have vector type");
4520 
4521  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4522  // legal. We can (and should) split that into 2 loads of <2 x double> here
4523  // but I'm leaving that as a TODO for now.
4524  assert(ResVT.isSimple() && "Can only handle simple types");
4525  switch (ResVT.getSimpleVT().SimpleTy) {
4526  default:
4527  return;
4528  case MVT::v2i8:
4529  case MVT::v2i16:
4530  case MVT::v2i32:
4531  case MVT::v2i64:
4532  case MVT::v2f16:
4533  case MVT::v2f32:
4534  case MVT::v2f64:
4535  case MVT::v4i8:
4536  case MVT::v4i16:
4537  case MVT::v4i32:
4538  case MVT::v4f16:
4539  case MVT::v4f32:
4540  case MVT::v8f16: // <4 x f16x2>
4541  // This is a "native" vector type
4542  break;
4543  }
4544 
4545  LoadSDNode *LD = cast<LoadSDNode>(N);
4546 
4547  unsigned Align = LD->getAlignment();
4548  auto &TD = DAG.getDataLayout();
4549  unsigned PrefAlign =
4550  TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4551  if (Align < PrefAlign) {
4552  // This load is not sufficiently aligned, so bail out and let this vector
4553  // load be scalarized. Note that we may still be able to emit smaller
4554  // vector loads. For example, if we are loading a <4 x float> with an
4555  // alignment of 8, this check will fail but the legalizer will try again
4556  // with 2 x <2 x float>, which will succeed with an alignment of 8.
4557  return;
4558  }
4559 
4560  EVT EltVT = ResVT.getVectorElementType();
4561  unsigned NumElts = ResVT.getVectorNumElements();
4562 
4563  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4564  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4565  // loaded type to i16 and propagate the "real" type as the memory type.
4566  bool NeedTrunc = false;
4567  if (EltVT.getSizeInBits() < 16) {
4568  EltVT = MVT::i16;
4569  NeedTrunc = true;
4570  }
4571 
4572  unsigned Opcode = 0;
4573  SDVTList LdResVTs;
4574  bool LoadF16x2 = false;
4575 
4576  switch (NumElts) {
4577  default:
4578  return;
4579  case 2:
4580  Opcode = NVPTXISD::LoadV2;
4581  LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4582  break;
4583  case 4: {
4584  Opcode = NVPTXISD::LoadV4;
4585  EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4586  LdResVTs = DAG.getVTList(ListVTs);
4587  break;
4588  }
4589  case 8: {
4590  // v8f16 is a special case. PTX doesn't have ld.v8.f16
4591  // instruction. Instead, we split the vector into v2f16 chunks and
4592  // load them with ld.v4.b32.
4593  assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
4594  LoadF16x2 = true;
4595  Opcode = NVPTXISD::LoadV4;
4596  EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
4597  MVT::Other};
4598  LdResVTs = DAG.getVTList(ListVTs);
4599  break;
4600  }
4601  }
4602 
4603  // Copy regular operands
4604  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4605 
4606  // The select routine does not have access to the LoadSDNode instance, so
4607  // pass along the extension information
4608  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4609 
4610  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4611  LD->getMemoryVT(),
4612  LD->getMemOperand());
4613 
4614  SmallVector<SDValue, 8> ScalarRes;
4615  if (LoadF16x2) {
4616  // Split v2f16 subvectors back into individual elements.
4617  NumElts /= 2;
4618  for (unsigned i = 0; i < NumElts; ++i) {
4619  SDValue SubVector = NewLD.getValue(i);
4620  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4621  DAG.getIntPtrConstant(0, DL));
4622  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4623  DAG.getIntPtrConstant(1, DL));
4624  ScalarRes.push_back(E0);
4625  ScalarRes.push_back(E1);
4626  }
4627  } else {
4628  for (unsigned i = 0; i < NumElts; ++i) {
4629  SDValue Res = NewLD.getValue(i);
4630  if (NeedTrunc)
4631  Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4632  ScalarRes.push_back(Res);
4633  }
4634  }
4635 
4636  SDValue LoadChain = NewLD.getValue(NumElts);
4637 
4638  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4639 
4640  Results.push_back(BuildVec);
4641  Results.push_back(LoadChain);
4642 }
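// Example (a sketch, not from the original source): a sufficiently aligned
// load of <4 x float> becomes one NVPTXISD::LoadV4 node with four f32 results
// plus a chain (typically emitted as a single ld.global.v4.f32), and the
// scalar results are recombined here with BUILD_VECTOR so the rest of the DAG
// still sees a v4f32 value.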
4643 
4644 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4645  SmallVectorImpl<SDValue> &Results) {
4646  SDValue Chain = N->getOperand(0);
4647  SDValue Intrin = N->getOperand(1);
4648  SDLoc DL(N);
4649 
4650  // Get the intrinsic ID
4651  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4652  switch (IntrinNo) {
4653  default:
4654  return;
4655  case Intrinsic::nvvm_ldg_global_i:
4656  case Intrinsic::nvvm_ldg_global_f:
4657  case Intrinsic::nvvm_ldg_global_p:
4658