LLVM  9.0.0svn
NVPTXISelLowering.cpp
1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/Analysis.h"
32 #include "llvm/IR/Argument.h"
33 #include "llvm/IR/Attributes.h"
34 #include "llvm/IR/CallSite.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/GlobalValue.h"
40 #include "llvm/IR/Instruction.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/Support/Casting.h"
46 #include "llvm/Support/CodeGen.h"
54 #include <algorithm>
55 #include <cassert>
56 #include <cstdint>
57 #include <iterator>
58 #include <sstream>
59 #include <string>
60 #include <utility>
61 #include <vector>
62 
63 #define DEBUG_TYPE "nvptx-lower"
64 
65 using namespace llvm;
66 
67 static unsigned int uniqueCallSite = 0;
68 
69 static cl::opt<bool> sched4reg(
70     "nvptx-sched4reg",
71     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
72 
73 static cl::opt<unsigned>
74 FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
75                     cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
76                              " 1: do it  2: do it aggressively"),
77                     cl::init(2));
78 
79 static cl::opt<int> UsePrecDivF32(
80     "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
81     cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
82              " IEEE Compliant F32 div.rnd if available."),
83     cl::init(2));
84 
85 static cl::opt<bool> UsePrecSqrtF32(
86     "nvptx-prec-sqrtf32", cl::Hidden,
87     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
88     cl::init(true));
89 
90 static cl::opt<bool> FtzEnabled(
91     "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
92     cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
93     cl::init(false));
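
// Illustrative usage (editorial note, not part of the original source): these
// cl::opt flags are read from the llc command line. For example, a
// hypothetical invocation such as
//   llc -march=nvptx64 -mcpu=sm_70 -nvptx-prec-divf32=0 -nvptx-f32ftz=1 foo.ll
// selects div.approx.f32 for f32 division and flushes f32 subnormals to zero.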
94 
95 int NVPTXTargetLowering::getDivF32Level() const {
96   if (UsePrecDivF32.getNumOccurrences() > 0) {
97     // If nvptx-prec-divf32=N is used on the command-line, always honor it
98     return UsePrecDivF32;
99   } else {
100     // Otherwise, use div.approx if fast math is enabled
101     if (getTargetMachine().Options.UnsafeFPMath)
102       return 0;
103     else
104       return 2;
105   }
106 }
107 
108 bool NVPTXTargetLowering::usePrecSqrtF32() const {
109   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
110     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
111     return UsePrecSqrtF32;
112   } else {
113     // Otherwise, use sqrt.approx if fast math is enabled
114     return !getTargetMachine().Options.UnsafeFPMath;
115   }
116 }
117 
118 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
119   // TODO: Get rid of this flag; there can be only one way to do this.
120  if (FtzEnabled.getNumOccurrences() > 0) {
121  // If nvptx-f32ftz is used on the command-line, always honor it
122  return FtzEnabled;
123  } else {
124  const Function &F = MF.getFunction();
125  // Otherwise, check for an nvptx-f32ftz attribute on the function
126  if (F.hasFnAttribute("nvptx-f32ftz"))
127  return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
128  else
129  return false;
130  }
131 }
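
// Editorial example (illustrative IR, not part of the original file): a
// function whose definition carries
//   attributes #0 = { "nvptx-f32ftz"="true" }
// makes useF32FTZ() return true (when the command-line flag is unset), so its
// f32 arithmetic is emitted with the .ftz (flush-subnormals-to-zero) modifier.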
132 
133 static bool IsPTXVectorType(MVT VT) {
134  switch (VT.SimpleTy) {
135  default:
136  return false;
137  case MVT::v2i1:
138  case MVT::v4i1:
139  case MVT::v2i8:
140  case MVT::v4i8:
141  case MVT::v2i16:
142  case MVT::v4i16:
143  case MVT::v2i32:
144  case MVT::v4i32:
145  case MVT::v2i64:
146  case MVT::v2f16:
147  case MVT::v4f16:
148  case MVT::v8f16: // <4 x f16x2>
149  case MVT::v2f32:
150  case MVT::v4f32:
151  case MVT::v2f64:
152  return true;
153  }
154 }
155 
156 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
157 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
158 /// into their primitive components.
159 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
160 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
161 /// LowerCall, and LowerReturn.
162 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
163                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
164                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
165                                uint64_t StartingOffset = 0) {
166  SmallVector<EVT, 16> TempVTs;
167  SmallVector<uint64_t, 16> TempOffsets;
168 
169  // Special case for i128 - decompose to (i64, i64)
170  if (Ty->isIntegerTy(128)) {
171  ValueVTs.push_back(EVT(MVT::i64));
172  ValueVTs.push_back(EVT(MVT::i64));
173 
174  if (Offsets) {
175  Offsets->push_back(StartingOffset + 0);
176  Offsets->push_back(StartingOffset + 8);
177  }
178 
179  return;
180  }
181 
182  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
183  if (StructType *STy = dyn_cast<StructType>(Ty)) {
184  auto const *SL = DL.getStructLayout(STy);
185  auto ElementNum = 0;
186  for(auto *EI : STy->elements()) {
187  ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
188  StartingOffset + SL->getElementOffset(ElementNum));
189  ++ElementNum;
190  }
191  return;
192  }
193 
194  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
195  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
196  EVT VT = TempVTs[i];
197  uint64_t Off = TempOffsets[i];
198  // Split vectors into individual elements, except for v2f16, which
199  // we will pass as a single scalar.
200  if (VT.isVector()) {
201  unsigned NumElts = VT.getVectorNumElements();
202  EVT EltVT = VT.getVectorElementType();
203  // Vectors with an even number of f16 elements will be passed to
204  // us as an array of v2f16 elements. We must match this so we
205  // stay in sync with Ins/Outs.
206  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
207  EltVT = MVT::v2f16;
208  NumElts /= 2;
209  }
210  for (unsigned j = 0; j != NumElts; ++j) {
211  ValueVTs.push_back(EltVT);
212  if (Offsets)
213  Offsets->push_back(Off + j * EltVT.getStoreSize());
214  }
215  } else {
216  ValueVTs.push_back(VT);
217  if (Offsets)
218  Offsets->push_back(Off);
219  }
220  }
221 }
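
// Worked example (editorial sketch, not in the original source): with
// StartingOffset == 0,
//   Ty == i128        ->  ValueVTs = {i64, i64},     Offsets = {0, 8}
//   Ty == <4 x half>  ->  ValueVTs = {v2f16, v2f16}, Offsets = {0, 4}
// so every flattened piece lines up one-to-one with the Ins/Outs arrays
// mentioned in the NOTE above.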
222 
223 // Check whether we can merge loads/stores of some of the pieces of a
224 // flattened function parameter or return value into a single vector
225 // load/store.
226 //
227 // The flattened parameter is represented as a list of EVTs and
228 // offsets, and the whole structure is aligned to ParamAlignment. This
229 // function determines whether we can load/store pieces of the
230 // parameter starting at index Idx using a single vectorized op of
231 // size AccessSize. If so, it returns the number of param pieces
232 // covered by the vector op. Otherwise, it returns 1.
233 static unsigned CanMergeParamLoadStoresStartingAt(
234     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
235     const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
236  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
237 
238  // Can't vectorize if param alignment is not sufficient.
239  if (AccessSize > ParamAlignment)
240  return 1;
241  // Can't vectorize if offset is not aligned.
242  if (Offsets[Idx] & (AccessSize - 1))
243  return 1;
244 
245  EVT EltVT = ValueVTs[Idx];
246  unsigned EltSize = EltVT.getStoreSize();
247 
248  // Element is too large to vectorize.
249  if (EltSize >= AccessSize)
250  return 1;
251 
252  unsigned NumElts = AccessSize / EltSize;
253   // Can't vectorize if AccessSize is not a multiple of EltSize.
254  if (AccessSize != EltSize * NumElts)
255  return 1;
256 
257  // We don't have enough elements to vectorize.
258  if (Idx + NumElts > ValueVTs.size())
259  return 1;
260 
261  // PTX ISA can only deal with 2- and 4-element vector ops.
262  if (NumElts != 4 && NumElts != 2)
263  return 1;
264 
265  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
266  // Types do not match.
267  if (ValueVTs[j] != EltVT)
268  return 1;
269 
270  // Elements are not contiguous.
271  if (Offsets[j] - Offsets[j - 1] != EltSize)
272  return 1;
273  }
274   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
275  return NumElts;
276 }
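
// Worked example (editorial, not in the original source): for a parameter
// flattened to four f32 pieces at Offsets = {0, 4, 8, 12} with
// ParamAlignment == 16, a query at Idx == 0 with AccessSize == 16 returns 4,
// so all four pieces can be covered by a single v4 access; with
// ParamAlignment == 4 the same query returns 1 and each piece stays scalar.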
277 
278 // Flags for tracking per-element vectorization state of loads/stores
279 // of a flattened function parameter or return value.
280 enum ParamVectorizationFlags {
281   PVF_INNER = 0x0, // Middle elements of a vector.
282   PVF_FIRST = 0x1, // First element of the vector.
283   PVF_LAST = 0x2,  // Last element of the vector.
284   // Scalar is effectively a 1-element vector.
285   PVF_SCALAR = PVF_FIRST | PVF_LAST
286 };
287 
288 // Computes whether and how we can vectorize the loads/stores of a
289 // flattened function parameter or return value.
290 //
291 // The flattened parameter is represented as the list of ValueVTs and
292 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
293 // of the same size as ValueVTs indicating how each piece should be
294 // loaded/stored (i.e. as a scalar, or as part of a vector
295 // load/store).
296 static SmallVector<ParamVectorizationFlags, 16>
297 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
298                      const SmallVectorImpl<uint64_t> &Offsets,
299                      unsigned ParamAlignment) {
300   // Set vector size to match ValueVTs and mark all elements as
301   // scalars by default.
302   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
303   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
304 
305  // Check what we can vectorize using 128/64/32-bit accesses.
306  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
307  // Skip elements we've already processed.
308  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
309  for (unsigned AccessSize : {16, 8, 4, 2}) {
310  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
311  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
312  // Mark vectorized elements.
313  switch (NumElts) {
314  default:
315  llvm_unreachable("Unexpected return value");
316  case 1:
317  // Can't vectorize using this size, try next smaller size.
318  continue;
319  case 2:
320  assert(I + 1 < E && "Not enough elements.");
321  VectorInfo[I] = PVF_FIRST;
322  VectorInfo[I + 1] = PVF_LAST;
323  I += 1;
324  break;
325  case 4:
326  assert(I + 3 < E && "Not enough elements.");
327  VectorInfo[I] = PVF_FIRST;
328  VectorInfo[I + 1] = PVF_INNER;
329  VectorInfo[I + 2] = PVF_INNER;
330  VectorInfo[I + 3] = PVF_LAST;
331  I += 3;
332  break;
333  }
334  // Break out of the inner loop because we've already succeeded
335  // using largest possible AccessSize.
336  break;
337  }
338  }
339  return VectorInfo;
340 }
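
// Worked example (editorial, not in the original source): for
// ValueVTs = {f32, f32, f32, f32, i8}, Offsets = {0, 4, 8, 12, 16} and
// ParamAlignment == 16, the function returns
//   {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST, PVF_SCALAR}
// i.e. the four f32 pieces form one v4 load/store and the trailing i8 stays
// a scalar access.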
341 
342 // NVPTXTargetLowering Constructor.
343 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
344                                          const NVPTXSubtarget &STI)
345     : TargetLowering(TM), nvTM(&TM), STI(STI) {
346   // Always lower memset, memcpy, and memmove intrinsics to load/store
347   // instructions, rather than generating calls to memset, memcpy, or
348   // memmove.
349  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
350  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
351  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
352 
355 
356  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
357  // condition branches.
358  setJumpIsExpensive(true);
359 
360  // Wide divides are _very_ slow. Try to reduce the width of the divide if
361  // possible.
362  addBypassSlowDiv(64, 32);
363 
364  // By default, use the Source scheduling
365   if (sched4reg)
366     setSchedulingPreference(Sched::RegPressure);
367   else
368     setSchedulingPreference(Sched::Source);
369 
370  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
371  LegalizeAction NoF16Action) {
372  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
373  };
374 
375  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
376  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
377  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
378  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
379  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
380  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
381  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
382  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
383 
384  // Conversion to/from FP16/FP16x2 is always legal.
391 
392  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
393  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
394 
395  // Operations not directly supported by NVPTX.
400  }
401 
402  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
403  // For others we will expand to a SHL/SRA pair.
409 
416 
419 
420  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
421  // that don't have h/w rotation we lower them to multi-instruction assembly.
422  // See ROT*_sw in NVPTXIntrInfo.td
427 
435 
436  // Indirect branch is not supported.
437  // This also disables Jump Table creation.
440 
443 
444  // We want to legalize constant related memmove and memcopy
445  // intrinsics.
447 
448  // Turn FP extload into load/fpextend
458  // Turn FP truncstore into trunc + store.
459  // FIXME: vector types should also be expanded
463 
464  // PTX does not support load / store predicate registers
467 
468  for (MVT VT : MVT::integer_valuetypes()) {
472  }
473 
474  // This is legal in NVPTX
478 
479  // TRAP can be lowered to PTX trap
481 
482  // Register custom handling for vector loads/stores
483  for (MVT VT : MVT::vector_valuetypes()) {
484  if (IsPTXVectorType(VT)) {
488  }
489  }
490 
491  // Custom handling for i8 intrinsics
493 
494  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
500 
503  }
504 
508 
509  // PTX does not directly support SELP of i1, so promote to i32 first
511 
512  // PTX cannot multiply two i64s in a single instruction.
515 
516  // We have some custom DAG combine patterns for these nodes
524 
525  // setcc for f16x2 needs special handling to prevent legalizer's
526  // attempt to scalarize it due to v2i1 not being legal.
527   if (STI.allowFP16Math())
528     setTargetDAGCombine(ISD::SETCC);
529 
530  // Promote fp16 arithmetic if fp16 hardware isn't available or the
531  // user passed --nvptx-no-fp16-math. The flag is useful because,
532  // although sm_53+ GPUs have some sort of FP16 support in
533   // hardware, only sm_53 and sm_60 have a full implementation. Others
534   // only have a token amount of hardware and are likely to run faster
535  // by using fp32 units instead.
536  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
537  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
538  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
539  }
540 
541   // There's no neg.f16 instruction. Expand to (0-x).
542   setOperationAction(ISD::FNEG, MVT::f16, Expand);
543   setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
544 
545  // (would be) Library functions.
546 
547  // These map to conversion instructions for scalar FP types.
548  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
554  }
555 
556  // 'Expand' implements FCOPYSIGN without calling an external library.
561 
562  // These map to corresponding instructions for f32/f64. f16 must be
563  // promoted to f32. v2f16 is expanded to f16, which is then promoted
564  // to f32.
565  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
571  }
576 
577  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
578  // No FPOW or FREM in PTX.
579 
580   // Now deduce the information based on the above-mentioned
581   // actions.
582   computeRegisterProperties(STI.getRegisterInfo());
583 }
584 
585 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
586  switch ((NVPTXISD::NodeType)Opcode) {
587   case NVPTXISD::FIRST_NUMBER:
588     break;
589  case NVPTXISD::CALL:
590  return "NVPTXISD::CALL";
591  case NVPTXISD::RET_FLAG:
592  return "NVPTXISD::RET_FLAG";
593   case NVPTXISD::LOAD_PARAM:
594     return "NVPTXISD::LOAD_PARAM";
595  case NVPTXISD::Wrapper:
596  return "NVPTXISD::Wrapper";
597   case NVPTXISD::DeclareParam:
598     return "NVPTXISD::DeclareParam";
599   case NVPTXISD::DeclareScalarParam:
600     return "NVPTXISD::DeclareScalarParam";
601   case NVPTXISD::DeclareRet:
602     return "NVPTXISD::DeclareRet";
603   case NVPTXISD::DeclareScalarRet:
604     return "NVPTXISD::DeclareScalarRet";
605   case NVPTXISD::DeclareRetParam:
606     return "NVPTXISD::DeclareRetParam";
607  case NVPTXISD::PrintCall:
608  return "NVPTXISD::PrintCall";
609   case NVPTXISD::PrintConvergentCall:
610     return "NVPTXISD::PrintConvergentCall";
611   case NVPTXISD::PrintCallUni:
612     return "NVPTXISD::PrintCallUni";
613   case NVPTXISD::PrintConvergentCallUni:
614     return "NVPTXISD::PrintConvergentCallUni";
615  case NVPTXISD::LoadParam:
616  return "NVPTXISD::LoadParam";
617   case NVPTXISD::LoadParamV2:
618     return "NVPTXISD::LoadParamV2";
619   case NVPTXISD::LoadParamV4:
620     return "NVPTXISD::LoadParamV4";
621   case NVPTXISD::StoreParam:
622     return "NVPTXISD::StoreParam";
623   case NVPTXISD::StoreParamV2:
624     return "NVPTXISD::StoreParamV2";
625   case NVPTXISD::StoreParamV4:
626     return "NVPTXISD::StoreParamV4";
627   case NVPTXISD::StoreParamS32:
628     return "NVPTXISD::StoreParamS32";
629   case NVPTXISD::StoreParamU32:
630     return "NVPTXISD::StoreParamU32";
631   case NVPTXISD::CallArgBegin:
632     return "NVPTXISD::CallArgBegin";
633  case NVPTXISD::CallArg:
634  return "NVPTXISD::CallArg";
635   case NVPTXISD::LastCallArg:
636     return "NVPTXISD::LastCallArg";
637   case NVPTXISD::CallArgEnd:
638     return "NVPTXISD::CallArgEnd";
639  case NVPTXISD::CallVoid:
640  return "NVPTXISD::CallVoid";
641  case NVPTXISD::CallVal:
642  return "NVPTXISD::CallVal";
643   case NVPTXISD::CallSymbol:
644     return "NVPTXISD::CallSymbol";
645  case NVPTXISD::Prototype:
646  return "NVPTXISD::Prototype";
647  case NVPTXISD::MoveParam:
648  return "NVPTXISD::MoveParam";
649   case NVPTXISD::StoreRetval:
650     return "NVPTXISD::StoreRetval";
651   case NVPTXISD::StoreRetvalV2:
652     return "NVPTXISD::StoreRetvalV2";
653   case NVPTXISD::StoreRetvalV4:
654     return "NVPTXISD::StoreRetvalV4";
655   case NVPTXISD::PseudoUseParam:
656     return "NVPTXISD::PseudoUseParam";
657  case NVPTXISD::RETURN:
658  return "NVPTXISD::RETURN";
659   case NVPTXISD::CallSeqBegin:
660     return "NVPTXISD::CallSeqBegin";
661   case NVPTXISD::CallSeqEnd:
662     return "NVPTXISD::CallSeqEnd";
663   case NVPTXISD::CallPrototype:
664     return "NVPTXISD::CallPrototype";
665  case NVPTXISD::ProxyReg:
666  return "NVPTXISD::ProxyReg";
667  case NVPTXISD::LoadV2:
668  return "NVPTXISD::LoadV2";
669  case NVPTXISD::LoadV4:
670  return "NVPTXISD::LoadV4";
671  case NVPTXISD::LDGV2:
672  return "NVPTXISD::LDGV2";
673  case NVPTXISD::LDGV4:
674  return "NVPTXISD::LDGV4";
675  case NVPTXISD::LDUV2:
676  return "NVPTXISD::LDUV2";
677  case NVPTXISD::LDUV4:
678  return "NVPTXISD::LDUV4";
679  case NVPTXISD::StoreV2:
680  return "NVPTXISD::StoreV2";
681  case NVPTXISD::StoreV4:
682  return "NVPTXISD::StoreV4";
683   case NVPTXISD::FUN_SHFL_CLAMP:
684     return "NVPTXISD::FUN_SHFL_CLAMP";
685   case NVPTXISD::FUN_SHFR_CLAMP:
686     return "NVPTXISD::FUN_SHFR_CLAMP";
687  case NVPTXISD::IMAD:
688  return "NVPTXISD::IMAD";
689   case NVPTXISD::SETP_F16X2:
690     return "NVPTXISD::SETP_F16X2";
691  case NVPTXISD::Dummy:
692  return "NVPTXISD::Dummy";
693   case NVPTXISD::MUL_WIDE_SIGNED:
694     return "NVPTXISD::MUL_WIDE_SIGNED";
695   case NVPTXISD::MUL_WIDE_UNSIGNED:
696     return "NVPTXISD::MUL_WIDE_UNSIGNED";
697  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
698  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
700  return "NVPTXISD::Tex1DFloatFloatLevel";
702  return "NVPTXISD::Tex1DFloatFloatGrad";
703  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
704  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
706  return "NVPTXISD::Tex1DS32FloatLevel";
708  return "NVPTXISD::Tex1DS32FloatGrad";
709  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
710  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
712  return "NVPTXISD::Tex1DU32FloatLevel";
714  return "NVPTXISD::Tex1DU32FloatGrad";
715  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
716  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
718  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
720  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
721  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
722  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
724  return "NVPTXISD::Tex1DArrayS32FloatLevel";
726  return "NVPTXISD::Tex1DArrayS32FloatGrad";
727  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
728  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
730  return "NVPTXISD::Tex1DArrayU32FloatLevel";
732  return "NVPTXISD::Tex1DArrayU32FloatGrad";
733  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
734  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
736  return "NVPTXISD::Tex2DFloatFloatLevel";
738  return "NVPTXISD::Tex2DFloatFloatGrad";
739  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
740  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
742  return "NVPTXISD::Tex2DS32FloatLevel";
744  return "NVPTXISD::Tex2DS32FloatGrad";
745  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
746  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
748  return "NVPTXISD::Tex2DU32FloatLevel";
750  return "NVPTXISD::Tex2DU32FloatGrad";
751  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
752  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
754  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
756  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
757  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
758  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
760  return "NVPTXISD::Tex2DArrayS32FloatLevel";
762  return "NVPTXISD::Tex2DArrayS32FloatGrad";
763  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
764  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
766  return "NVPTXISD::Tex2DArrayU32FloatLevel";
768  return "NVPTXISD::Tex2DArrayU32FloatGrad";
769  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
770  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
772  return "NVPTXISD::Tex3DFloatFloatLevel";
774  return "NVPTXISD::Tex3DFloatFloatGrad";
775  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
776  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
778  return "NVPTXISD::Tex3DS32FloatLevel";
780  return "NVPTXISD::Tex3DS32FloatGrad";
781  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
782  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
784  return "NVPTXISD::Tex3DU32FloatLevel";
786  return "NVPTXISD::Tex3DU32FloatGrad";
787  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
789  return "NVPTXISD::TexCubeFloatFloatLevel";
790  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
792  return "NVPTXISD::TexCubeS32FloatLevel";
793  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
795  return "NVPTXISD::TexCubeU32FloatLevel";
797  return "NVPTXISD::TexCubeArrayFloatFloat";
799  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
801  return "NVPTXISD::TexCubeArrayS32Float";
803  return "NVPTXISD::TexCubeArrayS32FloatLevel";
805  return "NVPTXISD::TexCubeArrayU32Float";
807  return "NVPTXISD::TexCubeArrayU32FloatLevel";
809  return "NVPTXISD::Tld4R2DFloatFloat";
811  return "NVPTXISD::Tld4G2DFloatFloat";
813  return "NVPTXISD::Tld4B2DFloatFloat";
815  return "NVPTXISD::Tld4A2DFloatFloat";
817  return "NVPTXISD::Tld4R2DS64Float";
819  return "NVPTXISD::Tld4G2DS64Float";
821  return "NVPTXISD::Tld4B2DS64Float";
823  return "NVPTXISD::Tld4A2DS64Float";
825  return "NVPTXISD::Tld4R2DU64Float";
827  return "NVPTXISD::Tld4G2DU64Float";
829  return "NVPTXISD::Tld4B2DU64Float";
831  return "NVPTXISD::Tld4A2DU64Float";
832 
834  return "NVPTXISD::TexUnified1DFloatS32";
836  return "NVPTXISD::TexUnified1DFloatFloat";
838  return "NVPTXISD::TexUnified1DFloatFloatLevel";
840  return "NVPTXISD::TexUnified1DFloatFloatGrad";
842  return "NVPTXISD::TexUnified1DS32S32";
844  return "NVPTXISD::TexUnified1DS32Float";
846  return "NVPTXISD::TexUnified1DS32FloatLevel";
848  return "NVPTXISD::TexUnified1DS32FloatGrad";
850  return "NVPTXISD::TexUnified1DU32S32";
852  return "NVPTXISD::TexUnified1DU32Float";
854  return "NVPTXISD::TexUnified1DU32FloatLevel";
856  return "NVPTXISD::TexUnified1DU32FloatGrad";
858  return "NVPTXISD::TexUnified1DArrayFloatS32";
860  return "NVPTXISD::TexUnified1DArrayFloatFloat";
862  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
864  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
866  return "NVPTXISD::TexUnified1DArrayS32S32";
868  return "NVPTXISD::TexUnified1DArrayS32Float";
870  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
872  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
874  return "NVPTXISD::TexUnified1DArrayU32S32";
876  return "NVPTXISD::TexUnified1DArrayU32Float";
878  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
880  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
882  return "NVPTXISD::TexUnified2DFloatS32";
884  return "NVPTXISD::TexUnified2DFloatFloat";
886  return "NVPTXISD::TexUnified2DFloatFloatLevel";
888  return "NVPTXISD::TexUnified2DFloatFloatGrad";
890  return "NVPTXISD::TexUnified2DS32S32";
892  return "NVPTXISD::TexUnified2DS32Float";
894  return "NVPTXISD::TexUnified2DS32FloatLevel";
896  return "NVPTXISD::TexUnified2DS32FloatGrad";
898  return "NVPTXISD::TexUnified2DU32S32";
900  return "NVPTXISD::TexUnified2DU32Float";
902  return "NVPTXISD::TexUnified2DU32FloatLevel";
904  return "NVPTXISD::TexUnified2DU32FloatGrad";
906  return "NVPTXISD::TexUnified2DArrayFloatS32";
908  return "NVPTXISD::TexUnified2DArrayFloatFloat";
910  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
912  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
914  return "NVPTXISD::TexUnified2DArrayS32S32";
916  return "NVPTXISD::TexUnified2DArrayS32Float";
918  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
920  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
922  return "NVPTXISD::TexUnified2DArrayU32S32";
924  return "NVPTXISD::TexUnified2DArrayU32Float";
926  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
928  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
930  return "NVPTXISD::TexUnified3DFloatS32";
932  return "NVPTXISD::TexUnified3DFloatFloat";
934  return "NVPTXISD::TexUnified3DFloatFloatLevel";
936  return "NVPTXISD::TexUnified3DFloatFloatGrad";
938  return "NVPTXISD::TexUnified3DS32S32";
940  return "NVPTXISD::TexUnified3DS32Float";
942  return "NVPTXISD::TexUnified3DS32FloatLevel";
944  return "NVPTXISD::TexUnified3DS32FloatGrad";
946  return "NVPTXISD::TexUnified3DU32S32";
948  return "NVPTXISD::TexUnified3DU32Float";
950  return "NVPTXISD::TexUnified3DU32FloatLevel";
952  return "NVPTXISD::TexUnified3DU32FloatGrad";
954  return "NVPTXISD::TexUnifiedCubeFloatFloat";
956  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
958  return "NVPTXISD::TexUnifiedCubeS32Float";
960  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
962  return "NVPTXISD::TexUnifiedCubeU32Float";
964  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
966  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
968  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
970  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
972  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
974  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
976  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
978  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
980  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
982  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
984  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
986  return "NVPTXISD::Tld4UnifiedR2DS64Float";
988  return "NVPTXISD::Tld4UnifiedG2DS64Float";
990  return "NVPTXISD::Tld4UnifiedB2DS64Float";
992  return "NVPTXISD::Tld4UnifiedA2DS64Float";
994  return "NVPTXISD::Tld4UnifiedR2DU64Float";
996  return "NVPTXISD::Tld4UnifiedG2DU64Float";
998  return "NVPTXISD::Tld4UnifiedB2DU64Float";
1000  return "NVPTXISD::Tld4UnifiedA2DU64Float";
1001 
1002  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1003  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1004  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1005  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1006  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1007  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1008  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1009  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1010  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1011  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1012  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1013 
1014  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1015  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1016  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1017  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1018  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1019  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1020  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1021  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1022  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1023  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1024  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1025 
1026  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1027  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1028  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1029  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1030  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1031  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1032  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1033  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1034  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1035  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1036  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1037 
1038  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1039  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1040  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1041  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1042  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1043  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1044  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1045  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1046  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1047  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1048  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1049 
1050  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1051  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1052  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1053  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1054  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1055  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1056  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1057  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1058  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1059  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1060  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1061 
1062  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1063  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1064  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1065  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1066  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1067  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1068  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1069  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1070  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1071  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1072  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1073 
1074  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1075  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1076  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1077  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1078  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1079  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1080  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1081  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1082  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1083  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1084  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1085 
1086  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1087  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1088  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1089  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1090  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1091  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1092  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1093  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1094  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1095  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1096  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1097 
1098  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1099  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1100  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1101  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1102  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1103  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1104  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1105  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1106  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1107  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1108  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1109 
1110  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1111  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1112  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1113  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1114  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1115  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1116  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1117  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1118  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1119  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1120  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1121 
1122  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1123  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1124  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1125  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1126  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1127  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1128  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1129  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1130  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1131  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1132  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1133 
1134  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1135  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1136  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1137  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1138  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1139  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1140  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1141  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1142  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1143  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1144  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1145 
1146  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1147  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1148  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1149  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1150  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1151  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1152  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1153  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1154  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1155  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1156  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1157 
1158  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1159  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1160  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1161  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1162  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1163  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1164  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1165  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1166  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1167  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1168  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1169 
1170  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1171  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1172  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1173  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1174  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1175  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1176  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1177  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1178  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1179  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1180  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1181  }
1182  return nullptr;
1183 }
1184 
1185 TargetLoweringBase::LegalizeTypeAction
1186 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1187   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1188     return TypeSplitVector;
1189   if (VT == MVT::v2f16)
1190     return TypeLegal;
1191   return TargetLoweringBase::getPreferredVectorAction(VT);
1192 }
1193 
1194 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1195                                              int Enabled, int &ExtraSteps,
1196                                              bool &UseOneConst,
1197                                              bool Reciprocal) const {
1198  if (!(Enabled == ReciprocalEstimate::Enabled ||
1199  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1200  return SDValue();
1201 
1202  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1203  ExtraSteps = 0;
1204 
1205  SDLoc DL(Operand);
1206  EVT VT = Operand.getValueType();
1207  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1208 
1209  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1210  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1211  DAG.getConstant(IID, DL, MVT::i32), Operand);
1212  };
1213 
1214  // The sqrt and rsqrt refinement processes assume we always start out with an
1215  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1216  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1217  // any refinement, we must return a regular sqrt.
1218  if (Reciprocal || ExtraSteps > 0) {
1219  if (VT == MVT::f32)
1220  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1221  : Intrinsic::nvvm_rsqrt_approx_f);
1222  else if (VT == MVT::f64)
1223  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1224  else
1225  return SDValue();
1226  } else {
1227  if (VT == MVT::f32)
1228  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1229  : Intrinsic::nvvm_sqrt_approx_f);
1230  else {
1231  // There's no sqrt.approx.f64 instruction, so we emit
1232  // reciprocal(rsqrt(x)). This is faster than
1233  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1234  // x * rsqrt(x).)
1235  return DAG.getNode(
1236  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1237  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1238  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1239  }
1240  }
1241 }
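
// Editorial note (not in the original source): with fast math enabled and no
// refinement steps requested, an f32 sqrt therefore lowers to sqrt.approx.f32
// (or its .ftz variant), 1/sqrt(x) lowers to rsqrt.approx.f32, and an f64
// sqrt is emitted as rcp.approx.ftz.f64 applied to rsqrt.approx.f64(x).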
1242 
1243 SDValue
1244 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1245   SDLoc dl(Op);
1246  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1247  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1248  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1249  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1250 }
1251 
1252 std::string NVPTXTargetLowering::getPrototype(
1253     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1254     const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
1255     ImmutableCallSite CS) const {
1256  auto PtrVT = getPointerTy(DL);
1257 
1258  bool isABI = (STI.getSmVersion() >= 20);
1259  assert(isABI && "Non-ABI compilation is not supported");
1260  if (!isABI)
1261  return "";
1262 
1263  std::stringstream O;
1264  O << "prototype_" << uniqueCallSite << " : .callprototype ";
1265 
1266  if (retTy->getTypeID() == Type::VoidTyID) {
1267  O << "()";
1268  } else {
1269  O << "(";
1270  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1271  unsigned size = 0;
1272  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1273  size = ITy->getBitWidth();
1274  } else {
1275  assert(retTy->isFloatingPointTy() &&
1276  "Floating point type expected here");
1277  size = retTy->getPrimitiveSizeInBits();
1278  }
1279  // PTX ABI requires all scalar return values to be at least 32
1280  // bits in size. fp16 normally uses .b16 as its storage type in
1281  // PTX, so its size must be adjusted here, too.
1282  if (size < 32)
1283  size = 32;
1284 
1285  O << ".param .b" << size << " _";
1286  } else if (isa<PointerType>(retTy)) {
1287  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1288  } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
1289  auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
1290  O << ".param .align " << retAlignment << " .b8 _["
1291  << DL.getTypeAllocSize(retTy) << "]";
1292  } else {
1293  llvm_unreachable("Unknown return type");
1294  }
1295  O << ") ";
1296  }
1297  O << "_ (";
1298 
1299  bool first = true;
1300 
1301  unsigned OIdx = 0;
1302  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1303  Type *Ty = Args[i].Ty;
1304  if (!first) {
1305  O << ", ";
1306  }
1307  first = false;
1308 
1309  if (!Outs[OIdx].Flags.isByVal()) {
1310  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1311  unsigned align = 0;
1312  const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1313  // +1 because index 0 is reserved for return type alignment
1314  if (!getAlign(*CallI, i + 1, align))
1315  align = DL.getABITypeAlignment(Ty);
1316  unsigned sz = DL.getTypeAllocSize(Ty);
1317  O << ".param .align " << align << " .b8 ";
1318  O << "_";
1319  O << "[" << sz << "]";
1320  // update the index for Outs
1321  SmallVector<EVT, 16> vtparts;
1322  ComputeValueVTs(*this, DL, Ty, vtparts);
1323  if (unsigned len = vtparts.size())
1324  OIdx += len - 1;
1325  continue;
1326  }
1327  // i8 types in IR will be i16 types in SDAG
1328  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1329  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1330  "type mismatch between callee prototype and arguments");
1331  // scalar type
1332  unsigned sz = 0;
1333  if (isa<IntegerType>(Ty)) {
1334  sz = cast<IntegerType>(Ty)->getBitWidth();
1335  if (sz < 32)
1336  sz = 32;
1337  } else if (isa<PointerType>(Ty)) {
1338  sz = PtrVT.getSizeInBits();
1339  } else if (Ty->isHalfTy())
1340  // PTX ABI requires all scalar parameters to be at least 32
1341  // bits in size. fp16 normally uses .b16 as its storage type
1342  // in PTX, so its size must be adjusted here, too.
1343  sz = 32;
1344  else
1345  sz = Ty->getPrimitiveSizeInBits();
1346  O << ".param .b" << sz << " ";
1347  O << "_";
1348  continue;
1349  }
1350  auto *PTy = dyn_cast<PointerType>(Ty);
1351  assert(PTy && "Param with byval attribute should be a pointer type");
1352  Type *ETy = PTy->getElementType();
1353 
1354  unsigned align = Outs[OIdx].Flags.getByValAlign();
1355  unsigned sz = DL.getTypeAllocSize(ETy);
1356  O << ".param .align " << align << " .b8 ";
1357  O << "_";
1358  O << "[" << sz << "]";
1359  }
1360  O << ");";
1361  return O.str();
1362 }
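
// Illustrative output (editorial, not from the original source): for an
// indirect call that takes (i32, float*) and returns i32, the string built
// here looks like
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// and is later emitted via NVPTXISD::CallPrototype ahead of the indirect call.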
1363 
1364 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1365  ImmutableCallSite CS,
1366  Type *Ty, unsigned Idx,
1367  const DataLayout &DL) const {
1368  if (!CS) {
1369  // CallSite is zero, fallback to ABI type alignment
1370  return DL.getABITypeAlignment(Ty);
1371  }
1372 
1373  unsigned Align = 0;
1374  const Value *DirectCallee = CS.getCalledFunction();
1375 
1376  if (!DirectCallee) {
1377  // We don't have a direct function symbol, but that may be because of
1378  // constant cast instructions in the call.
1379  const Instruction *CalleeI = CS.getInstruction();
1380  assert(CalleeI && "Call target is not a function or derived value?");
1381 
1382  // With bitcast'd call targets, the instruction will be the call
1383  if (isa<CallInst>(CalleeI)) {
1384  // Check if we have call alignment metadata
1385  if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1386  return Align;
1387 
1388  const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1389  // Ignore any bitcast instructions
1390  while (isa<ConstantExpr>(CalleeV)) {
1391  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1392  if (!CE->isCast())
1393  break;
1394  // Look through the bitcast
1395  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1396  }
1397 
1398  // We have now looked past all of the bitcasts. Do we finally have a
1399  // Function?
1400  if (isa<Function>(CalleeV))
1401  DirectCallee = CalleeV;
1402  }
1403  }
1404 
1405  // Check for function alignment information if we found that the
1406  // ultimate target is a Function
1407  if (DirectCallee)
1408  if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1409  return Align;
1410 
1411  // Call is indirect or alignment information is not available, fall back to
1412  // the ABI type alignment
1413  return DL.getABITypeAlignment(Ty);
1414 }
1415 
1416 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1417                                        SmallVectorImpl<SDValue> &InVals) const {
1418  SelectionDAG &DAG = CLI.DAG;
1419   SDLoc dl = CLI.DL;
1420   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1421   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1422   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1423   SDValue Chain = CLI.Chain;
1424  SDValue Callee = CLI.Callee;
1425  bool &isTailCall = CLI.IsTailCall;
1426  ArgListTy &Args = CLI.getArgs();
1427  Type *RetTy = CLI.RetTy;
1428  ImmutableCallSite CS = CLI.CS;
1429  const DataLayout &DL = DAG.getDataLayout();
1430 
1431  bool isABI = (STI.getSmVersion() >= 20);
1432  assert(isABI && "Non-ABI compilation is not supported");
1433  if (!isABI)
1434  return Chain;
1435 
1436  SDValue tempChain = Chain;
1437  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1438  SDValue InFlag = Chain.getValue(1);
1439 
1440  unsigned paramCount = 0;
1441  // Args.size() and Outs.size() need not match.
1442  // Outs.size() will be larger
1443  // * if there is an aggregate argument with multiple fields (each field
1444  // showing up separately in Outs)
1445  // * if there is a vector argument with more than typical vector-length
1446  // elements (generally if more than 4) where each vector element is
1447  // individually present in Outs.
1448  // So a different index should be used for indexing into Outs/OutVals.
1449  // See similar issue in LowerFormalArguments.
1450  unsigned OIdx = 0;
1451   // Declare the .params or .reg needed to pass values
1452   // to the function.
1453  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1454  EVT VT = Outs[OIdx].VT;
1455  Type *Ty = Args[i].Ty;
1456 
1457     if (!Outs[OIdx].Flags.isByVal()) {
1458       SmallVector<EVT, 16> VTs;
1459       SmallVector<uint64_t, 16> Offsets;
1460       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1461  unsigned ArgAlign =
1462  getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1463  unsigned AllocSize = DL.getTypeAllocSize(Ty);
1464  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1465  bool NeedAlign; // Does argument declaration specify alignment?
1466  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1467  // declare .param .align <align> .b8 .param<n>[<size>];
1468  SDValue DeclareParamOps[] = {
1469  Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1470  DAG.getConstant(paramCount, dl, MVT::i32),
1471  DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1472  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1473  DeclareParamOps);
1474  NeedAlign = true;
1475  } else {
1476  // declare .param .b<size> .param<n>;
1477  if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1478  // PTX ABI requires integral types to be at least 32 bits in
1479  // size. FP16 is loaded/stored using i16, so it's handled
1480  // here as well.
1481  AllocSize = 4;
1482  }
1483  SDValue DeclareScalarParamOps[] = {
1484  Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1485  DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1486  DAG.getConstant(0, dl, MVT::i32), InFlag};
1487  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1488  DeclareScalarParamOps);
1489  NeedAlign = false;
1490  }
1491  InFlag = Chain.getValue(1);
1492 
1493  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1494  // than 32-bits are sign extended or zero extended, depending on
1495  // whether they are signed or unsigned types. This case applies
1496  // only to scalar parameters and not to aggregate values.
1497  bool ExtendIntegerParam =
1498  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1499 
1500  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1501  SmallVector<SDValue, 6> StoreOperands;
1502  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1503  // New store.
1504  if (VectorInfo[j] & PVF_FIRST) {
1505  assert(StoreOperands.empty() && "Unfinished preceding store.");
1506  StoreOperands.push_back(Chain);
1507  StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1508  StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1509  }
1510 
1511  EVT EltVT = VTs[j];
1512  SDValue StVal = OutVals[OIdx];
1513  if (ExtendIntegerParam) {
1514  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1515  // zext/sext to i32
1516  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1517  : ISD::ZERO_EXTEND,
1518  dl, MVT::i32, StVal);
1519  } else if (EltVT.getSizeInBits() < 16) {
1520  // Use 16-bit registers for small stores as it's the
1521  // smallest general purpose register size supported by NVPTX.
1522  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1523  }
1524 
1525  // Record the value to store.
1526  StoreOperands.push_back(StVal);
1527 
1528  if (VectorInfo[j] & PVF_LAST) {
1529         unsigned NumElts = StoreOperands.size() - 3;
1530         NVPTXISD::NodeType Op;
1531         switch (NumElts) {
1532         case 1:
1533           Op = NVPTXISD::StoreParam;
1534           break;
1535         case 2:
1536           Op = NVPTXISD::StoreParamV2;
1537           break;
1538         case 4:
1539           Op = NVPTXISD::StoreParamV4;
1540           break;
1541  default:
1542  llvm_unreachable("Invalid vector info.");
1543  }
1544 
1545  StoreOperands.push_back(InFlag);
1546 
1547  // Adjust type of the store op if we've extended the scalar
1548  // return value.
1549  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1550  unsigned EltAlign =
1551  NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1552 
1553  Chain = DAG.getMemIntrinsicNode(
1554  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1555             TheStoreType, MachinePointerInfo(), EltAlign,
1556             MachineMemOperand::MOStore);
1557         InFlag = Chain.getValue(1);
1558 
1559  // Cleanup.
1560  StoreOperands.clear();
1561  }
1562  ++OIdx;
1563  }
1564  assert(StoreOperands.empty() && "Unfinished parameter store.");
1565  if (VTs.size() > 0)
1566  --OIdx;
1567  ++paramCount;
1568  continue;
1569  }
1570 
1571     // ByVal arguments
1572     SmallVector<EVT, 16> VTs;
1573     SmallVector<uint64_t, 16> Offsets;
1574     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1575  assert(PTy && "Type of a byval parameter should be pointer");
1576  ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1577 
1578  // declare .param .align <align> .b8 .param<n>[<size>];
1579  unsigned sz = Outs[OIdx].Flags.getByValSize();
1580  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1581  unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1582     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1583  // so we don't need to worry about natural alignment or not.
1584  // See TargetLowering::LowerCallTo().
1585 
1586     // Enforce minimum alignment of 4 to work around ptxas miscompile
1587  // for sm_50+. See corresponding alignment adjustment in
1588  // emitFunctionParamList() for details.
1589  if (ArgAlign < 4)
1590  ArgAlign = 4;
1591  SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1592  DAG.getConstant(paramCount, dl, MVT::i32),
1593  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1594  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1595  DeclareParamOps);
1596  InFlag = Chain.getValue(1);
1597  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1598  EVT elemtype = VTs[j];
1599  int curOffset = Offsets[j];
1600  unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1601  auto PtrVT = getPointerTy(DL);
1602  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1603  DAG.getConstant(curOffset, dl, PtrVT));
1604  SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1605  MachinePointerInfo(), PartAlign);
1606  if (elemtype.getSizeInBits() < 16) {
1607  theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1608  }
1609  SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1610  SDValue CopyParamOps[] = { Chain,
1611  DAG.getConstant(paramCount, dl, MVT::i32),
1612  DAG.getConstant(curOffset, dl, MVT::i32),
1613  theVal, InFlag };
1614  Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1615  CopyParamOps, elemtype,
1616                                       MachinePointerInfo(), /* Align */ 0,
1617                                       MachineMemOperand::MOStore);
1618 
1619  InFlag = Chain.getValue(1);
1620  }
1621  ++paramCount;
1622  }
1623 
1624   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee);
1625   unsigned retAlignment = 0;
1626 
1627  // Handle Result
1628  if (Ins.size() > 0) {
1629  SmallVector<EVT, 16> resvtparts;
1630  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1631 
1632  // Declare
1633  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1634  // .param .b<size-in-bits> retval0
1635  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1636  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1637  // these three types to match the logic in
1638  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1639  // Plus, this behavior is consistent with nvcc's.
1640  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1641  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1642  // Scalar needs to be at least 32bit wide
1643  if (resultsz < 32)
1644  resultsz = 32;
1645  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1646  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1647  DAG.getConstant(resultsz, dl, MVT::i32),
1648  DAG.getConstant(0, dl, MVT::i32), InFlag };
1649  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1650  DeclareRetOps);
1651  InFlag = Chain.getValue(1);
1652  } else {
1653  retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1654  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1655  SDValue DeclareRetOps[] = { Chain,
1656  DAG.getConstant(retAlignment, dl, MVT::i32),
1657  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1658  DAG.getConstant(0, dl, MVT::i32), InFlag };
1659  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1660  DeclareRetOps);
1661  InFlag = Chain.getValue(1);
1662  }
1663  }
1664 
1665  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1666  // between them we must rely on the call site value which is valid for
1667  // indirect calls but is always null for libcalls.
1668  bool isIndirectCall = !Func && CS;
1669 
1670  if (isa<ExternalSymbolSDNode>(Callee)) {
1671  Function* CalleeFunc = nullptr;
1672 
1673  // Try to find the callee in the current module.
1674  Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1675  assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1676 
1677  // Set the "libcall callee" attribute to indicate that the function
1678  // must always have a declaration.
1679  CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1680  }
1681 
1682  if (isIndirectCall) {
1683  // This is indirect function call case : PTX requires a prototype of the
1684  // form
1685  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1686  // to be emitted, and the label has to be used as the last arg of the call
1687  // instruction.
1688  // The prototype is embedded in a string and put as the operand for a
1689  // CallPrototype SDNode which will print out to the value of the string.
1690  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1691  std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1692  const char *ProtoStr =
1693  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1694  SDValue ProtoOps[] = {
1695  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1696  };
1697  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1698  InFlag = Chain.getValue(1);
1699  }
1700  // Op to just print "call"
1701  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1702  SDValue PrintCallOps[] = {
1703  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1704  };
1705  // We model convergent calls as separate opcodes.
1706  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1707  if (CLI.IsConvergent)
1708  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1709  : NVPTXISD::PrintConvergentCall;
1710  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1711  InFlag = Chain.getValue(1);
1712 
1713  // Ops to print out the function name
1714  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1715  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1716  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1717  InFlag = Chain.getValue(1);
1718 
1719  // Ops to print out the param list
1720  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1721  SDValue CallArgBeginOps[] = { Chain, InFlag };
1722  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1723  CallArgBeginOps);
1724  InFlag = Chain.getValue(1);
1725 
1726  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1727  unsigned opcode;
1728  if (i == (e - 1))
1729  opcode = NVPTXISD::LastCallArg;
1730  else
1731  opcode = NVPTXISD::CallArg;
1732  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1733  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1734  DAG.getConstant(i, dl, MVT::i32), InFlag };
1735  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1736  InFlag = Chain.getValue(1);
1737  }
1738  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1739  SDValue CallArgEndOps[] = { Chain,
1740  DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1741  InFlag };
1742  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1743  InFlag = Chain.getValue(1);
1744 
1745  if (isIndirectCall) {
1746  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1747  SDValue PrototypeOps[] = { Chain,
1748  DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1749  InFlag };
1750  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1751  InFlag = Chain.getValue(1);
1752  }
1753 
1754  SmallVector<SDValue, 16> ProxyRegOps;
1755  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1756 
1757  // Generate loads from param memory/moves from registers for result
1758  if (Ins.size() > 0) {
1759  SmallVector<EVT, 16> VTs;
1760  SmallVector<uint64_t, 16> Offsets;
1761  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1762  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1763 
1764  unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1765  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1766 
1767  SmallVector<EVT, 6> LoadVTs;
1768  int VecIdx = -1; // Index of the first element of the vector.
1769 
1770  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1771  // 32 bits are sign extended or zero extended, depending on whether
1772  // they are signed or unsigned types.
1773  bool ExtendIntegerRetVal =
1774  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1775 
1776  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1777  bool needTruncate = false;
1778  EVT TheLoadType = VTs[i];
1779  EVT EltType = Ins[i].VT;
1780  unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1781  if (ExtendIntegerRetVal) {
1782  TheLoadType = MVT::i32;
1783  EltType = MVT::i32;
1784  needTruncate = true;
1785  } else if (TheLoadType.getSizeInBits() < 16) {
1786  if (VTs[i].isInteger())
1787  needTruncate = true;
1788  EltType = MVT::i16;
1789  }
1790 
1791  // Record index of the very first element of the vector.
1792  if (VectorInfo[i] & PVF_FIRST) {
1793  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1794  VecIdx = i;
1795  }
1796 
1797  LoadVTs.push_back(EltType);
1798 
1799  if (VectorInfo[i] & PVF_LAST) {
1800  unsigned NumElts = LoadVTs.size();
1801  LoadVTs.push_back(MVT::Other);
1802  LoadVTs.push_back(MVT::Glue);
1803  NVPTXISD::NodeType Op;
1804  switch (NumElts) {
1805  case 1:
1806  Op = NVPTXISD::LoadParam;
1807  break;
1808  case 2:
1809  Op = NVPTXISD::LoadParamV2;
1810  break;
1811  case 4:
1812  Op = NVPTXISD::LoadParamV4;
1813  break;
1814  default:
1815  llvm_unreachable("Invalid vector info.");
1816  }
1817 
1818  SDValue LoadOperands[] = {
1819  Chain, DAG.getConstant(1, dl, MVT::i32),
1820  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1821  SDValue RetVal = DAG.getMemIntrinsicNode(
1822  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1823  MachinePointerInfo(), EltAlign,
1824  MachineMemOperand::MOLoad);
1825 
1826  for (unsigned j = 0; j < NumElts; ++j) {
1827  ProxyRegOps.push_back(RetVal.getValue(j));
1828 
1829  if (needTruncate)
1830  ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1831  else
1832  ProxyRegTruncates.push_back(Optional<MVT>());
1833  }
1834 
1835  Chain = RetVal.getValue(NumElts);
1836  InFlag = RetVal.getValue(NumElts + 1);
1837 
1838  // Cleanup
1839  VecIdx = -1;
1840  LoadVTs.clear();
1841  }
1842  }
1843  }
1844 
1845  Chain = DAG.getCALLSEQ_END(Chain,
1846  DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1847  DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1848  true),
1849  InFlag, dl);
1850  InFlag = Chain.getValue(1);
1851  uniqueCallSite++;
1852 
1853  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1854  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1855  // dangling.
1856  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1857  SDValue Ret = DAG.getNode(
1858  NVPTXISD::ProxyReg, dl,
1859  DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1860  { Chain, ProxyRegOps[i], InFlag }
1861  );
1862 
1863  Chain = Ret.getValue(1);
1864  InFlag = Ret.getValue(2);
1865 
1866  if (ProxyRegTruncates[i].hasValue()) {
1867  Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1868  }
1869 
1870  InVals.push_back(Ret);
1871  }
1872 
1873  // set isTailCall to false for now, until we figure out how to express
1874  // tail call optimization in PTX
1875  isTailCall = false;
1876  return Chain;
1877 }
1878 
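The per-element alignments used above, GreatestCommonDivisor64(ObjectAlign, ElementOffset), follow a simple rule: a field at byte offset Off inside an object aligned to Align bytes is itself aligned to gcd(Align, Off). A minimal standalone sketch with illustrative values only (not part of this file):

#include <cassert>
#include <cstdint>
#include <numeric>

// gcd(Align, 0) == Align, so the first element keeps the full alignment.
uint64_t elementAlign(uint64_t Align, uint64_t Off) {
  return std::gcd(Align, Off);
}

int main() {
  assert(elementAlign(8, 0) == 8); // first field of an 8-byte-aligned block
  assert(elementAlign(8, 4) == 4); // field at offset 4 only gets 4 bytes
  assert(elementAlign(8, 6) == 2);
  return 0;
}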
1879 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1880 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1881 // We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
1882 SDValue
1883 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1884  SDNode *Node = Op.getNode();
1885  SDLoc dl(Node);
1886  SmallVector<SDValue, 8> Ops;
1887  unsigned NumOperands = Node->getNumOperands();
1888  for (unsigned i = 0; i < NumOperands; ++i) {
1889  SDValue SubOp = Node->getOperand(i);
1890  EVT VVT = SubOp.getNode()->getValueType(0);
1891  EVT EltVT = VVT.getVectorElementType();
1892  unsigned NumSubElem = VVT.getVectorNumElements();
1893  for (unsigned j = 0; j < NumSubElem; ++j) {
1894  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1895  DAG.getIntPtrConstant(j, dl)));
1896  }
1897  }
1898  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1899 }
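As a standalone illustration of the flattening performed above (a sketch, not part of this file): concatenating two 2-element vectors produces one 4-element operand list, each entry coming from an individual element extraction.

#include <vector>

// Analogue of the loop above: every element of every CONCAT_VECTORS operand
// is pulled out (EXTRACT_VECTOR_ELT) and appended, in order, to the operand
// list of a single BUILD_VECTOR.
std::vector<float> concatElements(const std::vector<std::vector<float>> &Subs) {
  std::vector<float> Ops;
  for (const auto &Sub : Subs)
    for (float Elt : Sub)
      Ops.push_back(Elt);
  return Ops;
}

int main() {
  auto Ops = concatElements({{1.f, 2.f}, {3.f, 4.f}});
  return Ops.size() == 4 ? 0 : 1;
}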
1900 
1901 // We can init constant f16x2 with a single .b32 move. Normally it
1902 // would get lowered as two constant loads and a vector-packing move.
1903 // mov.b16 %h1, 0x4000;
1904 // mov.b16 %h2, 0x3C00;
1905 // mov.b32 %hh2, {%h2, %h1};
1906 // Instead we want just a constant move:
1907 // mov.b32 %hh2, 0x40003C00
1908 //
1909 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1910 // generates good SASS in both cases.
1911 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1912  SelectionDAG &DAG) const {
1913  //return Op;
1914  if (!(Op->getValueType(0) == MVT::v2f16 &&
1915  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1916  isa<ConstantFPSDNode>(Op->getOperand(1))))
1917  return Op;
1918 
1919  APInt E0 =
1920  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1921  APInt E1 =
1922  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1923  SDValue Const =
1924  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1925  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1926 }
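A standalone sketch of the packing arithmetic above, assuming the usual IEEE-754 half encodings 0x3C00 == 1.0 and 0x4000 == 2.0 (illustrative values, not part of this file): the constant vector <1.0, 2.0> folds to the single 32-bit immediate 0x40003C00, matching the mov.b32 in the comment.

#include <cassert>
#include <cstdint>

// Element 0 occupies the low 16 bits, element 1 the high 16 bits, exactly as
// in Const = (E1 zext to 32) << 16 | (E0 zext to 32) above.
uint32_t packHalves(uint16_t E0, uint16_t E1) {
  return (uint32_t(E1) << 16) | uint32_t(E0);
}

int main() {
  assert(packHalves(0x3C00, 0x4000) == 0x40003C00u);
  return 0;
}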
1927 
1928 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1929  SelectionDAG &DAG) const {
1930  SDValue Index = Op->getOperand(1);
1931  // Constant index will be matched by tablegen.
1932  if (isa<ConstantSDNode>(Index.getNode()))
1933  return Op;
1934 
1935  // Extract individual elements and select one of them.
1936  SDValue Vector = Op->getOperand(0);
1937  EVT VectorVT = Vector.getValueType();
1938  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1939  EVT EltVT = VectorVT.getVectorElementType();
1940 
1941  SDLoc dl(Op.getNode());
1942  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1943  DAG.getIntPtrConstant(0, dl));
1944  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1945  DAG.getIntPtrConstant(1, dl));
1946  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1947  ISD::CondCode::SETEQ);
1948 }
1949 
1950 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1951 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
1952 /// amount, or
1953 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
1954 /// amount.
1955 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1956  SelectionDAG &DAG) const {
1957  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1958  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1959 
1960  EVT VT = Op.getValueType();
1961  unsigned VTBits = VT.getSizeInBits();
1962  SDLoc dl(Op);
1963  SDValue ShOpLo = Op.getOperand(0);
1964  SDValue ShOpHi = Op.getOperand(1);
1965  SDValue ShAmt = Op.getOperand(2);
1966  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1967 
1968  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1969  // For 32-bit shifts on sm_35 and newer, we can use the funnel shift 'shf' instruction.
1970  // {dHi, dLo} = {aHi, aLo} >> Amt
1971  // dHi = aHi >> Amt
1972  // dLo = shf.r.clamp aLo, aHi, Amt
1973 
1974  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1975  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1976  ShAmt);
1977 
1978  SDValue Ops[2] = { Lo, Hi };
1979  return DAG.getMergeValues(Ops, dl);
1980  }
1981  else {
1982  // {dHi, dLo} = {aHi, aLo} >> Amt
1983  // - if (Amt>=size) then
1984  // dLo = aHi >> (Amt-size)
1985  // dHi = aHi >> Amt (this is either all 0 or all 1)
1986  // else
1987  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1988  // dHi = aHi >> Amt
1989 
1990  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1991  DAG.getConstant(VTBits, dl, MVT::i32),
1992  ShAmt);
1993  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1994  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1995  DAG.getConstant(VTBits, dl, MVT::i32));
1996  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1997  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1998  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1999 
2000  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2001  DAG.getConstant(VTBits, dl, MVT::i32),
2002  ISD::SETGE);
2003  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2004  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2005 
2006  SDValue Ops[2] = { Lo, Hi };
2007  return DAG.getMergeValues(Ops, dl);
2008  }
2009 }
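A standalone sketch of the generic (non-shf) path above for the logical-shift case, computed on 32-bit halves (an assumed helper for illustration, not part of this file). It assumes Amt < 64; the Amt == 0 branch exists only to avoid the undefined C++ shift by 32 that the DAG select handles implicitly.

#include <cassert>
#include <cstdint>

// {OutHi, OutLo} = {Hi, Lo} >> Amt (logical), mirroring the comment above.
void srlParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
              uint32_t &OutLo, uint32_t &OutHi) {
  if (Amt >= 32) {
    OutLo = Hi >> (Amt - 32); // dLo comes entirely from the high half
    OutHi = 0;                // all zero for a logical shift
  } else if (Amt == 0) {
    OutLo = Lo;
    OutHi = Hi;
  } else {
    OutLo = (Lo >> Amt) | (Hi << (32 - Amt));
    OutHi = Hi >> Amt;
  }
}

int main() {
  uint32_t Lo, Hi;
  srlParts(0x89ABCDEF, 0x01234567, 8, Lo, Hi); // 0x0123456789ABCDEF >> 8
  assert(Lo == 0x6789ABCD && Hi == 0x00012345);
  return 0;
}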
2010 
2011 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2012 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2013 /// amount, or
2014 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2015 /// amount.
2016 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2017  SelectionDAG &DAG) const {
2018  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2019  assert(Op.getOpcode() == ISD::SHL_PARTS);
2020 
2021  EVT VT = Op.getValueType();
2022  unsigned VTBits = VT.getSizeInBits();
2023  SDLoc dl(Op);
2024  SDValue ShOpLo = Op.getOperand(0);
2025  SDValue ShOpHi = Op.getOperand(1);
2026  SDValue ShAmt = Op.getOperand(2);
2027 
2028  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2029  // For 32-bit shifts on sm_35 and newer, we can use the funnel shift 'shf' instruction.
2030  // {dHi, dLo} = {aHi, aLo} << Amt
2031  // dHi = shf.l.clamp aLo, aHi, Amt
2032  // dLo = aLo << Amt
2033 
2034  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2035  ShAmt);
2036  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2037 
2038  SDValue Ops[2] = { Lo, Hi };
2039  return DAG.getMergeValues(Ops, dl);
2040  }
2041  else {
2042  // {dHi, dLo} = {aHi, aLo} << Amt
2043  // - if (Amt>=size) then
2044  // dLo = aLo << Amt (all 0)
2045  // dLo = aLo << (Amt-size)
2046  // else
2047  // dLo = aLo << Amt
2048  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2049 
2050  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2051  DAG.getConstant(VTBits, dl, MVT::i32),
2052  ShAmt);
2053  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2054  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2055  DAG.getConstant(VTBits, dl, MVT::i32));
2056  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2057  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2058  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2059 
2060  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2061  DAG.getConstant(VTBits, dl, MVT::i32),
2062  ISD::SETGE);
2063  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2064  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2065 
2066  SDValue Ops[2] = { Lo, Hi };
2067  return DAG.getMergeValues(Ops, dl);
2068  }
2069 }
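The same kind of standalone sketch for the left-shift expansion above (illustration only, not part of this file):

#include <cassert>
#include <cstdint>

// {OutHi, OutLo} = {Hi, Lo} << Amt on 32-bit halves; assumes Amt < 64, and
// Amt == 0 is special-cased only to avoid the undefined C++ shift by 32.
void shlParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
              uint32_t &OutLo, uint32_t &OutHi) {
  if (Amt >= 32) {
    OutHi = Lo << (Amt - 32); // dHi comes entirely from the low half
    OutLo = 0;
  } else if (Amt == 0) {
    OutLo = Lo;
    OutHi = Hi;
  } else {
    OutHi = (Hi << Amt) | (Lo >> (32 - Amt));
    OutLo = Lo << Amt;
  }
}

int main() {
  uint32_t Lo, Hi;
  shlParts(0x89ABCDEF, 0x01234567, 8, Lo, Hi); // 0x0123456789ABCDEF << 8
  assert(Hi == 0x23456789 && Lo == 0xABCDEF00);
  return 0;
}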
2070 
2071 SDValue
2072 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2073  switch (Op.getOpcode()) {
2074  case ISD::RETURNADDR:
2075  return SDValue();
2076  case ISD::FRAMEADDR:
2077  return SDValue();
2078  case ISD::GlobalAddress:
2079  return LowerGlobalAddress(Op, DAG);
2080  case ISD::INTRINSIC_W_CHAIN:
2081  return Op;
2082  case ISD::BUILD_VECTOR:
2083  return LowerBUILD_VECTOR(Op, DAG);
2084  case ISD::EXTRACT_SUBVECTOR:
2085  return Op;
2086  case ISD::EXTRACT_VECTOR_ELT:
2087  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2088  case ISD::CONCAT_VECTORS:
2089  return LowerCONCAT_VECTORS(Op, DAG);
2090  case ISD::STORE:
2091  return LowerSTORE(Op, DAG);
2092  case ISD::LOAD:
2093  return LowerLOAD(Op, DAG);
2094  case ISD::SHL_PARTS:
2095  return LowerShiftLeftParts(Op, DAG);
2096  case ISD::SRA_PARTS:
2097  case ISD::SRL_PARTS:
2098  return LowerShiftRightParts(Op, DAG);
2099  case ISD::SELECT:
2100  return LowerSelect(Op, DAG);
2101  default:
2102  llvm_unreachable("Custom lowering not defined for operation");
2103  }
2104 }
2105 
2106 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2107  SDValue Op0 = Op->getOperand(0);
2108  SDValue Op1 = Op->getOperand(1);
2109  SDValue Op2 = Op->getOperand(2);
2110  SDLoc DL(Op.getNode());
2111 
2112  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2113 
2114  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2115  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2116  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2117  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2118 
2119  return Trunc;
2120 }
2121 
2122 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2123  if (Op.getValueType() == MVT::i1)
2124  return LowerLOADi1(Op, DAG);
2125 
2126  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2127  // loads and have to handle it here.
2128  if (Op.getValueType() == MVT::v2f16) {
2129  LoadSDNode *Load = cast<LoadSDNode>(Op);
2130  EVT MemVT = Load->getMemoryVT();
2131  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2132  Load->getAddressSpace(), Load->getAlignment())) {
2133  SDValue Ops[2];
2134  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2135  return DAG.getMergeValues(Ops, SDLoc(Op));
2136  }
2137  }
2138 
2139  return SDValue();
2140 }
2141 
2142 // v = ld i1* addr
2143 // =>
2144 // v1 = ld i8* addr (-> i16)
2145 // v = trunc i16 to i1
2146 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2147  SDNode *Node = Op.getNode();
2148  LoadSDNode *LD = cast<LoadSDNode>(Node);
2149  SDLoc dl(Node);
2150  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2151  assert(Node->getValueType(0) == MVT::i1 &&
2152  "Custom lowering for i1 load only");
2153  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2154  LD->getPointerInfo(), LD->getAlignment(),
2155  LD->getMemOperand()->getFlags());
2156  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2157  // The legalizer (the caller) is expecting two values from the legalized
2158  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2159  // in LegalizeDAG.cpp which also uses MergeValues.
2160  SDValue Ops[] = { result, LD->getChain() };
2161  return DAG.getMergeValues(Ops, dl);
2162 }
2163 
2164 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2165  StoreSDNode *Store = cast<StoreSDNode>(Op);
2166  EVT VT = Store->getMemoryVT();
2167 
2168  if (VT == MVT::i1)
2169  return LowerSTOREi1(Op, DAG);
2170 
2171  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2172  // stores and have to handle it here.
2173  if (VT == MVT::v2f16 &&
2174  !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2175  Store->getAddressSpace(), Store->getAlignment()))
2176  return expandUnalignedStore(Store, DAG);
2177 
2178  if (VT.isVector())
2179  return LowerSTOREVector(Op, DAG);
2180 
2181  return SDValue();
2182 }
2183 
2184 SDValue
2185 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2186  SDNode *N = Op.getNode();
2187  SDValue Val = N->getOperand(1);
2188  SDLoc DL(N);
2189  EVT ValVT = Val.getValueType();
2190 
2191  if (ValVT.isVector()) {
2192  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2193  // legal. We can (and should) split that into 2 stores of <2 x double> here
2194  // but I'm leaving that as a TODO for now.
2195  if (!ValVT.isSimple())
2196  return SDValue();
2197  switch (ValVT.getSimpleVT().SimpleTy) {
2198  default:
2199  return SDValue();
2200  case MVT::v2i8:
2201  case MVT::v2i16:
2202  case MVT::v2i32:
2203  case MVT::v2i64:
2204  case MVT::v2f16:
2205  case MVT::v2f32:
2206  case MVT::v2f64:
2207  case MVT::v4i8:
2208  case MVT::v4i16:
2209  case MVT::v4i32:
2210  case MVT::v4f16:
2211  case MVT::v4f32:
2212  case MVT::v8f16: // <4 x f16x2>
2213  // This is a "native" vector type
2214  break;
2215  }
2216 
2217  MemSDNode *MemSD = cast<MemSDNode>(N);
2218  const DataLayout &TD = DAG.getDataLayout();
2219 
2220  unsigned Align = MemSD->getAlignment();
2221  unsigned PrefAlign =
2222  TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2223  if (Align < PrefAlign) {
2224  // This store is not sufficiently aligned, so bail out and let this vector
2225  // store be scalarized. Note that we may still be able to emit smaller
2226  // vector stores. For example, if we are storing a <4 x float> with an
2227  // alignment of 8, this check will fail but the legalizer will try again
2228  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2229  return SDValue();
2230  }
2231 
2232  unsigned Opcode = 0;
2233  EVT EltVT = ValVT.getVectorElementType();
2234  unsigned NumElts = ValVT.getVectorNumElements();
2235 
2236  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2237  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2238  // stored type to i16 and propagate the "real" type as the memory type.
2239  bool NeedExt = false;
2240  if (EltVT.getSizeInBits() < 16)
2241  NeedExt = true;
2242 
2243  bool StoreF16x2 = false;
2244  switch (NumElts) {
2245  default:
2246  return SDValue();
2247  case 2:
2248  Opcode = NVPTXISD::StoreV2;
2249  break;
2250  case 4:
2251  Opcode = NVPTXISD::StoreV4;
2252  break;
2253  case 8:
2254  // v8f16 is a special case. PTX doesn't have st.v8.f16
2255  // instruction. Instead, we split the vector into v2f16 chunks and
2256  // store them with st.v4.b32.
2257  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2258  Opcode = NVPTXISD::StoreV4;
2259  StoreF16x2 = true;
2260  break;
2261  }
2262 
2263  SmallVector<SDValue, 8> Ops;
2264 
2265  // First is the chain
2266  Ops.push_back(N->getOperand(0));
2267 
2268  if (StoreF16x2) {
2269  // Combine f16,f16 -> v2f16
2270  NumElts /= 2;
2271  for (unsigned i = 0; i < NumElts; ++i) {
2272  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2273  DAG.getIntPtrConstant(i * 2, DL));
2274  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2275  DAG.getIntPtrConstant(i * 2 + 1, DL));
2276  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2277  Ops.push_back(V2);
2278  }
2279  } else {
2280  // Then the split values
2281  for (unsigned i = 0; i < NumElts; ++i) {
2282  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2283  DAG.getIntPtrConstant(i, DL));
2284  if (NeedExt)
2285  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2286  Ops.push_back(ExtVal);
2287  }
2288  }
2289 
2290  // Then any remaining arguments
2291  Ops.append(N->op_begin() + 2, N->op_end());
2292 
2293  SDValue NewSt =
2294  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2295  MemSD->getMemoryVT(), MemSD->getMemOperand());
2296 
2297  // return DCI.CombineTo(N, NewSt, true);
2298  return NewSt;
2299  }
2300 
2301  return SDValue();
2302 }
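For the v8f16 case above, a standalone sketch of the pairing (illustrative indices only, not part of this file): store operand i of the st.v4 is built from the original f16 elements 2*i and 2*i + 1.

#include <cassert>
#include <utility>
#include <vector>

// Which pair of f16 elements ends up in each v2f16 operand of StoreV4.
std::vector<std::pair<unsigned, unsigned>> pairF16Elements(unsigned NumElts) {
  std::vector<std::pair<unsigned, unsigned>> Pairs;
  for (unsigned i = 0; i < NumElts / 2; ++i)
    Pairs.push_back({i * 2, i * 2 + 1});
  return Pairs;
}

int main() {
  auto P = pairF16Elements(8);
  assert(P.size() == 4 && P[3].first == 6 && P[3].second == 7);
  return 0;
}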
2303 
2304 // st i1 v, addr
2305 // =>
2306 // v1 = zxt v to i16
2307 // st.u8 i16, addr
2308 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2309  SDNode *Node = Op.getNode();
2310  SDLoc dl(Node);
2311  StoreSDNode *ST = cast<StoreSDNode>(Node);
2312  SDValue Tmp1 = ST->getChain();
2313  SDValue Tmp2 = ST->getBasePtr();
2314  SDValue Tmp3 = ST->getValue();
2315  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2316  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2317  SDValue Result =
2318  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2319  ST->getAlignment(), ST->getMemOperand()->getFlags());
2320  return Result;
2321 }
2322 
2323 SDValue
2324 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2325  std::string ParamSym;
2326  raw_string_ostream ParamStr(ParamSym);
2327 
2328  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2329  ParamStr.flush();
2330 
2331  std::string *SavedStr =
2332  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2333  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2334 }
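A standalone sketch of the parameter-symbol naming used above, with a hypothetical function name "foo" for illustration (not part of this file): parameter symbols are spelled "<function-name>_param_<index>".

#include <cassert>
#include <sstream>
#include <string>

std::string paramSymbolName(const std::string &FnName, int Idx) {
  std::ostringstream OS;
  OS << FnName << "_param_" << Idx; // same format as getParamSymbol above
  return OS.str();
}

int main() {
  assert(paramSymbolName("foo", 1) == "foo_param_1");
  return 0;
}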
2335 
2336 // Check to see if the kernel argument is image*_t or sampler_t
2337 
2338 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2339  static const char *const specialTypes[] = { "struct._image2d_t",
2340  "struct._image3d_t",
2341  "struct._sampler_t" };
2342 
2343  Type *Ty = arg->getType();
2344  auto *PTy = dyn_cast<PointerType>(Ty);
2345 
2346  if (!PTy)
2347  return false;
2348 
2349  if (!context)
2350  return false;
2351 
2352  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2353  if (!STy || STy->isLiteral())
2354  return false;
2355 
2356  return std::find(std::begin(specialTypes), std::end(specialTypes),
2357  STy->getName()) != std::end(specialTypes);
2358 }
2359 
2360 SDValue NVPTXTargetLowering::LowerFormalArguments(
2361  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2362  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2363  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2364  MachineFunction &MF = DAG.getMachineFunction();
2365  const DataLayout &DL = DAG.getDataLayout();
2366  auto PtrVT = getPointerTy(DAG.getDataLayout());
2367 
2368  const Function *F = &MF.getFunction();
2369  const AttributeList &PAL = F->getAttributes();
2370  const TargetLowering *TLI = STI.getTargetLowering();
2371 
2372  SDValue Root = DAG.getRoot();
2373  std::vector<SDValue> OutChains;
2374 
2375  bool isABI = (STI.getSmVersion() >= 20);
2376  assert(isABI && "Non-ABI compilation is not supported");
2377  if (!isABI)
2378  return Chain;
2379 
2380  std::vector<Type *> argTypes;
2381  std::vector<const Argument *> theArgs;
2382  for (const Argument &I : F->args()) {
2383  theArgs.push_back(&I);
2384  argTypes.push_back(I.getType());
2385  }
2386  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2387  // Ins.size() will be larger
2388  // * if there is an aggregate argument with multiple fields (each field
2389  // showing up separately in Ins)
2390  // * if there is a vector argument with more than typical vector-length
2391  // elements (generally if more than 4) where each vector element is
2392  // individually present in Ins.
2393  // So a different index should be used for indexing into Ins.
2394  // See similar issue in LowerCall.
2395  unsigned InsIdx = 0;
2396 
2397  int idx = 0;
2398  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2399  Type *Ty = argTypes[i];
2400 
2401  // If the kernel argument is image*_t or sampler_t, convert it to
2402  // an i32 constant holding the parameter position. This can later be
2403  // matched in the AsmPrinter to output the correct mangled name.
2404  if (isImageOrSamplerVal(
2405  theArgs[i],
2406  (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2407  : nullptr))) {
2408  assert(isKernelFunction(*F) &&
2409  "Only kernels can have image/sampler params");
2410  InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2411  continue;
2412  }
2413 
2414  if (theArgs[i]->use_empty()) {
2415  // argument is dead
2416  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2417  SmallVector<EVT, 16> vtparts;
2418 
2419  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2420  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2421  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2422  ++parti) {
2423  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2424  ++InsIdx;
2425  }
2426  if (vtparts.size() > 0)
2427  --InsIdx;
2428  continue;
2429  }
2430  if (Ty->isVectorTy()) {
2431  EVT ObjectVT = getValueType(DL, Ty);
2432  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2433  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2434  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2435  ++InsIdx;
2436  }
2437  if (NumRegs > 0)
2438  --InsIdx;
2439  continue;
2440  }
2441  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2442  continue;
2443  }
2444 
2445  // In the following cases, assign a node order of "idx+1"
2446  // to newly created nodes. The SDNodes for params have to
2447  // appear in the same order as their order of appearance
2448  // in the original function. "idx+1" holds that order.
2449  if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2450  bool aggregateIsPacked = false;
2451  if (StructType *STy = dyn_cast<StructType>(Ty))
2452  aggregateIsPacked = STy->isPacked();
2453 
2454  SmallVector<EVT, 16> VTs;
2455  SmallVector<uint64_t, 16> Offsets;
2456  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2457  assert(VTs.size() > 0 && "Unexpected empty type.");
2458  auto VectorInfo =
2459  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2460 
2461  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2462  int VecIdx = -1; // Index of the first element of the current vector.
2463  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2464  if (VectorInfo[parti] & PVF_FIRST) {
2465  assert(VecIdx == -1 && "Orphaned vector.");
2466  VecIdx = parti;
2467  }
2468 
2469  // That's the last element of this store op.
2470  if (VectorInfo[parti] & PVF_LAST) {
2471  unsigned NumElts = parti - VecIdx + 1;
2472  EVT EltVT = VTs[parti];
2473  // i1 is loaded/stored as i8.
2474  EVT LoadVT = EltVT;
2475  if (EltVT == MVT::i1)
2476  LoadVT = MVT::i8;
2477  else if (EltVT == MVT::v2f16)
2478  // getLoad needs a vector type, but it can't handle
2479  // vectors which contain v2f16 elements. So we must load
2480  // using i32 here and then bitcast back.
2481  LoadVT = MVT::i32;
2482 
2483  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2484  SDValue VecAddr =
2485  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2486  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2487  Value *srcValue = Constant::getNullValue(PointerType::get(
2488  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2489  SDValue P =
2490  DAG.getLoad(VecVT, dl, Root, VecAddr,
2491  MachinePointerInfo(srcValue), aggregateIsPacked,
2492  MachineMemOperand::MODereferenceable |
2493  MachineMemOperand::MOInvariant);
2494  if (P.getNode())
2495  P.getNode()->setIROrder(idx + 1);
2496  for (unsigned j = 0; j < NumElts; ++j) {
2497  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2498  DAG.getIntPtrConstant(j, dl));
2499  // We've loaded i1 as an i8 and now must truncate it back to i1
2500  if (EltVT == MVT::i1)
2501  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2502  // v2f16 was loaded as an i32. Now we must bitcast it back.
2503  else if (EltVT == MVT::v2f16)
2504  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2505  // Extend the element if necessary (e.g. an i8 is loaded
2506  // into an i16 register)
2507  if (Ins[InsIdx].VT.isInteger() &&
2508  Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2509  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2510  : ISD::ZERO_EXTEND;
2511  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2512  }
2513  InVals.push_back(Elt);
2514  }
2515 
2516  // Reset vector tracking state.
2517  VecIdx = -1;
2518  }
2519  ++InsIdx;
2520  }
2521  if (VTs.size() > 0)
2522  --InsIdx;
2523  continue;
2524  }
2525 
2526  // Param has ByVal attribute
2527  // Return MoveParam(param symbol).
2528  // Ideally, the param symbol can be returned directly,
2529  // but when SDNode builder decides to use it in a CopyToReg(),
2530  // machine instruction fails because TargetExternalSymbol
2531  // (not lowered) is target dependent, and CopyToReg assumes
2532  // the source is lowered.
2533  EVT ObjectVT = getValueType(DL, Ty);
2534  assert(ObjectVT == Ins[InsIdx].VT &&
2535  "Ins type did not match function type");
2536  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2537  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2538  if (p.getNode())
2539  p.getNode()->setIROrder(idx + 1);
2540  InVals.push_back(p);
2541  }
2542 
2543  // Clang will check explicit VarArg and issue an error if any. However,
2544  // Clang will let code with an implicit vararg declaration, like f(),
2545  // pass. See bug 617733.
2546  // We treat this case as if the arg list is empty.
2547  // if (F.isVarArg()) {
2548  // assert(0 && "VarArg not supported yet!");
2549  //}
2550 
2551  if (!OutChains.empty())
2552  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2553 
2554  return Chain;
2555 }
2556 
2557 SDValue
2558 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2559  bool isVarArg,
2560  const SmallVectorImpl<ISD::OutputArg> &Outs,
2561  const SmallVectorImpl<SDValue> &OutVals,
2562  const SDLoc &dl, SelectionDAG &DAG) const {
2563  MachineFunction &MF = DAG.getMachineFunction();
2564  Type *RetTy = MF.getFunction().getReturnType();
2565 
2566  bool isABI = (STI.getSmVersion() >= 20);
2567  assert(isABI && "Non-ABI compilation is not supported");
2568  if (!isABI)
2569  return Chain;
2570 
2571  const DataLayout DL = DAG.getDataLayout();
2572  SmallVector<EVT, 16> VTs;
2573  SmallVector<uint64_t, 16> Offsets;
2574  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2575  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2576 
2577  auto VectorInfo = VectorizePTXValueVTs(
2578  VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2579 
2580  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2581  // 32 bits are sign extended or zero extended, depending on whether
2582  // they are signed or unsigned types.
2583  bool ExtendIntegerRetVal =
2584  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2585 
2586  SmallVector<SDValue, 6> StoreOperands;
2587  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2588  // New load/store. Record chain and offset operands.
2589  if (VectorInfo[i] & PVF_FIRST) {
2590  assert(StoreOperands.empty() && "Orphaned operand list.");
2591  StoreOperands.push_back(Chain);
2592  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2593  }
2594 
2595  SDValue RetVal = OutVals[i];
2596  if (ExtendIntegerRetVal) {
2597  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2598  : ISD::ZERO_EXTEND,
2599  dl, MVT::i32, RetVal);
2600  } else if (RetVal.getValueSizeInBits() < 16) {
2601  // Use 16-bit registers for small load-stores as it's the
2602  // smallest general purpose register size supported by NVPTX.
2603  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2604  }
2605 
2606  // Record the value to return.
2607  StoreOperands.push_back(RetVal);
2608 
2609  // That's the last element of this store op.
2610  if (VectorInfo[i] & PVF_LAST) {
2611  NVPTXISD::NodeType Op;
2612  unsigned NumElts = StoreOperands.size() - 2;
2613  switch (NumElts) {
2614  case 1:
2615  Op = NVPTXISD::StoreRetval;
2616  break;
2617  case 2:
2618  Op = NVPTXISD::StoreRetvalV2;
2619  break;
2620  case 4:
2621  Op = NVPTXISD::StoreRetvalV4;
2622  break;
2623  default:
2624  llvm_unreachable("Invalid vector info.");
2625  }
2626 
2627  // Adjust type of load/store op if we've extended the scalar
2628  // return value.
2629  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2630  Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2631  StoreOperands, TheStoreType,
2632  MachinePointerInfo(), /* Align */ 1,
2633  MachineMemOperand::MOStore);
2634  // Cleanup vector state.
2635  StoreOperands.clear();
2636  }
2637  }
2638 
2639  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2640 }
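For the sub-32-bit extension rule applied above (PTX Interoperability Guide 3.3(A)), a standalone sketch of the widening that happens before the StoreRetval nodes (illustration only, not part of this file): signed values sign-extend to 32 bits, unsigned values zero-extend.

#include <cassert>
#include <cstdint>

int32_t widenSigned(int8_t V) { return static_cast<int32_t>(V); }      // SIGN_EXTEND
uint32_t widenUnsigned(uint8_t V) { return static_cast<uint32_t>(V); } // ZERO_EXTEND

int main() {
  assert(widenSigned(int8_t(-1)) == -1);
  assert(widenUnsigned(uint8_t(0xFF)) == 0xFFu);
  return 0;
}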
2641 
2642 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2643  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2644  SelectionDAG &DAG) const {
2645  if (Constraint.length() > 1)
2646  return;
2647  else
2648  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2649 }
2650 
2651 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2652  switch (Intrinsic) {
2653  default:
2654  return 0;
2655 
2656  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2657  return NVPTXISD::Tex1DFloatS32;
2658  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2660  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2662  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2664  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2665  return NVPTXISD::Tex1DS32S32;
2666  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2667  return NVPTXISD::Tex1DS32Float;
2668  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2670  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2672  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2673  return NVPTXISD::Tex1DU32S32;
2674  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2675  return NVPTXISD::Tex1DU32Float;
2676  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2678  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2680 
2681  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2683  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2685  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2687  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2689  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2691  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2693  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2695  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2697  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2699  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2701  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2703  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2705 
2706  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2707  return NVPTXISD::Tex2DFloatS32;
2708  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2710  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2712  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2714  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2715  return NVPTXISD::Tex2DS32S32;
2716  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2717  return NVPTXISD::Tex2DS32Float;
2718  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2720  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2722  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2723  return NVPTXISD::Tex2DU32S32;
2724  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2725  return NVPTXISD::Tex2DU32Float;
2726  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2728  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2730 
2731  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2733  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2735  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2737  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2739  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2741  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2743  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2745  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2747  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2749  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2751  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2753  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2755 
2756  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2757  return NVPTXISD::Tex3DFloatS32;
2758  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2760  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2762  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2764  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2765  return NVPTXISD::Tex3DS32S32;
2766  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2767  return NVPTXISD::Tex3DS32Float;
2768  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2770  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2772  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2773  return NVPTXISD::Tex3DU32S32;
2774  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2775  return NVPTXISD::Tex3DU32Float;
2776  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2778  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2780 
2781  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2783  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2785  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2787  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2789  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2791  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2793 
2794  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2796  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2798  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2800  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2802  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2804  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2806 
2807  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2809  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2811  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2813  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2815  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2817  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2819  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2821  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2823  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2825  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2827  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2829  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2831 
2832  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2834  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2836  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2838  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2840  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2842  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2844  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2846  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2848  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2850  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2852  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2854  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2856 
2857  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2859  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2861  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2863  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2865  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2867  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2869  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2871  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2873  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2875  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2877  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2879  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2881 
2882  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2884  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2886  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2888  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2890  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2892  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2894  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2896  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2898  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2900  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2902  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2904  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2906 
2907  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2909  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2911  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2913  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2915  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2917  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2919  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2921  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2923  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2925  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2927  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2929  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2931 
2932  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2934  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2936  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2938  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2940  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
2942  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
2944  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
2946  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
2948  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
2950  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
2952  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
2954  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
2956 
2957  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
2959  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
2961  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
2963  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
2965  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
2967  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
2969 
2970  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
2972  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
2974  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
2976  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
2978  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
2980  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
2982 
2983  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
2985  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
2987  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
2989  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
2991  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
2993  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
2995  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
2997  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
2999  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3001  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3003  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3005  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3007  }
3008 }
3009 
3010 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3011  switch (Intrinsic) {
3012  default:
3013  return 0;
3014  case Intrinsic::nvvm_suld_1d_i8_clamp:
3015  return NVPTXISD::Suld1DI8Clamp;
3016  case Intrinsic::nvvm_suld_1d_i16_clamp:
3017  return NVPTXISD::Suld1DI16Clamp;
3018  case Intrinsic::nvvm_suld_1d_i32_clamp:
3019  return NVPTXISD::Suld1DI32Clamp;
3020  case Intrinsic::nvvm_suld_1d_i64_clamp:
3021  return NVPTXISD::Suld1DI64Clamp;
3022  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3024  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3026  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3028  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3030  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3032  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3034  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3036  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3038  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3040  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3042  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3044  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3046  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3048  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3050  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3052  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3054  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3056  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3058  case Intrinsic::nvvm_suld_2d_i8_clamp:
3059  return NVPTXISD::Suld2DI8Clamp;
3060  case Intrinsic::nvvm_suld_2d_i16_clamp:
3061  return NVPTXISD::Suld2DI16Clamp;
3062  case Intrinsic::nvvm_suld_2d_i32_clamp:
3063  return NVPTXISD::Suld2DI32Clamp;
3064  case Intrinsic::nvvm_suld_2d_i64_clamp:
3065  return NVPTXISD::Suld2DI64Clamp;
3066  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3068  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3070  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3072  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3074  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3076  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3078  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3080  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3082  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3084  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3086  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3088  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3090  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3092  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3094  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3096  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3098  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3100  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3102  case Intrinsic::nvvm_suld_3d_i8_clamp:
3103  return NVPTXISD::Suld3DI8Clamp;
3104  case Intrinsic::nvvm_suld_3d_i16_clamp:
3105  return NVPTXISD::Suld3DI16Clamp;
3106  case Intrinsic::nvvm_suld_3d_i32_clamp:
3107  return NVPTXISD::Suld3DI32Clamp;
3108  case Intrinsic::nvvm_suld_3d_i64_clamp:
3109  return NVPTXISD::Suld3DI64Clamp;
3110  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3112  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3114  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3116  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3118  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3120  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3122  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3124  case Intrinsic::nvvm_suld_1d_i8_trap:
3125  return NVPTXISD::Suld1DI8Trap;
3126  case Intrinsic::nvvm_suld_1d_i16_trap:
3127  return NVPTXISD::Suld1DI16Trap;
3128  case Intrinsic::nvvm_suld_1d_i32_trap:
3129  return NVPTXISD::Suld1DI32Trap;
3130  case Intrinsic::nvvm_suld_1d_i64_trap:
3131  return NVPTXISD::Suld1DI64Trap;
3132  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3133  return NVPTXISD::Suld1DV2I8Trap;
3134  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3136  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3138  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3140  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3141  return NVPTXISD::Suld1DV4I8Trap;
3142  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3144  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3146  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3148  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3150  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3152  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3154  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3156  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3158  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3160  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3162  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3164  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3166  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3168  case Intrinsic::nvvm_suld_2d_i8_trap:
3169  return NVPTXISD::Suld2DI8Trap;
3170  case Intrinsic::nvvm_suld_2d_i16_trap:
3171  return NVPTXISD::Suld2DI16Trap;
3172  case Intrinsic::nvvm_suld_2d_i32_trap:
3173  return NVPTXISD::Suld2DI32Trap;
3174  case Intrinsic::nvvm_suld_2d_i64_trap:
3175  return NVPTXISD::Suld2DI64Trap;
3176  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3177  return NVPTXISD::Suld2DV2I8Trap;
3178  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3180  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3182  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3184  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3185  return NVPTXISD::Suld2DV4I8Trap;
3186  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3188  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3190  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3192  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3194  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3196  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3198  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3200  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3202  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3204  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3206  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3208  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3210  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3212  case Intrinsic::nvvm_suld_3d_i8_trap:
3213  return NVPTXISD::Suld3DI8Trap;
3214  case Intrinsic::nvvm_suld_3d_i16_trap:
3215  return NVPTXISD::Suld3DI16Trap;
3216  case Intrinsic::nvvm_suld_3d_i32_trap:
3217  return NVPTXISD::Suld3DI32Trap;
3218  case Intrinsic::nvvm_suld_3d_i64_trap:
3219  return NVPTXISD::Suld3DI64Trap;
3220  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3221  return NVPTXISD::Suld3DV2I8Trap;
3222  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3224  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3226  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3228  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3229  return NVPTXISD::Suld3DV4I8Trap;
3230  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3232  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3234  case Intrinsic::nvvm_suld_1d_i8_zero:
3235  return NVPTXISD::Suld1DI8Zero;
3236  case Intrinsic::nvvm_suld_1d_i16_zero:
3237  return NVPTXISD::Suld1DI16Zero;
3238  case Intrinsic::nvvm_suld_1d_i32_zero:
3239  return NVPTXISD::Suld1DI32Zero;
3240  case Intrinsic::nvvm_suld_1d_i64_zero:
3241  return NVPTXISD::Suld1DI64Zero;
3242  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3243  return NVPTXISD::Suld1DV2I8Zero;
3244  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3246  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3248  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3250  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3251  return NVPTXISD::Suld1DV4I8Zero;
3252  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3254  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3256  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3258  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3260  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3262  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3264  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3266  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3268  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3270  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3272  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3274  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3276  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3278  case Intrinsic::nvvm_suld_2d_i8_zero:
3279  return NVPTXISD::Suld2DI8Zero;
3280  case Intrinsic::nvvm_suld_2d_i16_zero:
3281  return NVPTXISD::Suld2DI16Zero;
3282  case Intrinsic::nvvm_suld_2d_i32_zero:
3283  return NVPTXISD::Suld2DI32Zero;
3284  case Intrinsic::nvvm_suld_2d_i64_zero:
3285  return NVPTXISD::Suld2DI64Zero;
3286  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3287  return NVPTXISD::Suld2DV2I8Zero;
3288  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3290  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3292  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3294  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3295  return NVPTXISD::Suld2DV4I8Zero;
3296  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3298  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3300  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3302  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3304  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3306  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3308  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3310  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3312  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3314  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3316  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3318  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3320  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3322  case Intrinsic::nvvm_suld_3d_i8_zero:
3323  return NVPTXISD::Suld3DI8Zero;
3324  case Intrinsic::nvvm_suld_3d_i16_zero:
3325  return NVPTXISD::Suld3DI16Zero;
3326  case Intrinsic::nvvm_suld_3d_i32_zero:
3327  return NVPTXISD::Suld3DI32Zero;
3328  case Intrinsic::nvvm_suld_3d_i64_zero:
3329  return NVPTXISD::Suld3DI64Zero;
3330  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3331  return NVPTXISD::Suld3DV2I8Zero;
3332  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3334  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3336  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3338  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3339  return NVPTXISD::Suld3DV4I8Zero;
3340  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3342  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3344  }
3345 }
3346 
3347 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be
3348 // modeled as TgtMemIntrinsic because we need the information that
3349 // is only available in the "Value" type of the destination
3350 // pointer. In particular, the address space
3351 // information.
3352 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3353  IntrinsicInfo &Info, const CallInst &I,
3354  MachineFunction &MF, unsigned Intrinsic) const {
3355  switch (Intrinsic) {
3356  default:
3357  return false;
3358  case Intrinsic::nvvm_match_all_sync_i32p:
3359  case Intrinsic::nvvm_match_all_sync_i64p:
3360  Info.opc = ISD::INTRINSIC_W_CHAIN;
3361  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3362  // in order to model data exchange with other threads, but perform no real
3363  // memory accesses.
3364  Info.memVT = MVT::i1;
3365 
3366  // Our result depends on both our and other thread's arguments.
3367  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3368  return true;
3369  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3370  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3371  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3372  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3373  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3374  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3375  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3376  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3377  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3378  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3379  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3380  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3381  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3382  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3383  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3384  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3385  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3386  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3387  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3388  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3389  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3390  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3391  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3392  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3393  Info.opc = ISD::INTRINSIC_W_CHAIN;
3394  Info.memVT = MVT::v8f16;
3395  Info.ptrVal = I.getArgOperand(0);
3396  Info.offset = 0;
3397  Info.flags = MachineMemOperand::MOLoad;
3398  Info.align = 16;
3399  return true;
3400  }
3401 
3402  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3403  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3404  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3405  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3406  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3407  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3408  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3409  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3410  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3411  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3412  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3413  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3414  Info.opc = ISD::INTRINSIC_W_CHAIN;
3415  Info.memVT = MVT::v4f16;
3416  Info.ptrVal = I.getArgOperand(0);
3417  Info.offset = 0;
3418  Info.flags = MachineMemOperand::MOLoad;
3419  Info.align = 16;
3420  return true;
3421  }
3422 
3423  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3424  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3425  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3426  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3427  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3428  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3429  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3430  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3431  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3432  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3433  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3434  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3435  Info.opc = ISD::INTRINSIC_W_CHAIN;
3436  Info.memVT = MVT::v8f32;
3437  Info.ptrVal = I.getArgOperand(0);
3438  Info.offset = 0;
3439  Info.flags = MachineMemOperand::MOLoad;
3440  Info.align = 16;
3441  return true;
3442  }
3443 
3444  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3445  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3446  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3447  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3448  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3449  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3450  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3451  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3452  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3453  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3454  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3455  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3456  Info.opc = ISD::INTRINSIC_VOID;
3457  Info.memVT = MVT::v4f16;
3458  Info.ptrVal = I.getArgOperand(0);
3459  Info.offset = 0;
3460  Info.flags = MachineMemOperand::MOStore;
3461  Info.align = 16;
3462  return true;
3463  }
3464 
3465  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3466  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3467  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3468  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3469  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3470  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3471  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3472  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3473  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3474  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3475  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3476  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3477  Info.opc = ISD::INTRINSIC_VOID;
3478  Info.memVT = MVT::v8f32;
3479  Info.ptrVal = I.getArgOperand(0);
3480  Info.offset = 0;
3481  Info.flags = MachineMemOperand::MOStore;
3482  Info.align = 16;
3483  return true;
3484  }
3485 
3486  case Intrinsic::nvvm_atomic_load_add_f32:
3487  case Intrinsic::nvvm_atomic_load_add_f64:
3488  case Intrinsic::nvvm_atomic_load_inc_32:
3489  case Intrinsic::nvvm_atomic_load_dec_32:
3490 
3491  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3492  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3493  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3494  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3495  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3496  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3497  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3498  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3499  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3500  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3501  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3502  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3503  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3504  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3505  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3506  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3507  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3508  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3509  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3510  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3511  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3512  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3513  auto &DL = I.getModule()->getDataLayout();
3514  Info.opc = ISD::INTRINSIC_W_CHAIN;
3515  Info.memVT = getValueType(DL, I.getType());
3516  Info.ptrVal = I.getArgOperand(0);
3517  Info.offset = 0;
3518  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3519  Info.align = 0;
3520  return true;
3521  }
3522 
3523  case Intrinsic::nvvm_ldu_global_i:
3524  case Intrinsic::nvvm_ldu_global_f:
3525  case Intrinsic::nvvm_ldu_global_p: {
3526  auto &DL = I.getModule()->getDataLayout();
3527  Info.opc = ISD::INTRINSIC_W_CHAIN;
3528  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3529  Info.memVT = getValueType(DL, I.getType());
3530  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3531  Info.memVT = getPointerTy(DL);
3532  else
3533  Info.memVT = getValueType(DL, I.getType());
3534  Info.ptrVal = I.getArgOperand(0);
3535  Info.offset = 0;
3536  Info.flags = MachineMemOperand::MOLoad;
3537  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3538 
3539  return true;
3540  }
3541  case Intrinsic::nvvm_ldg_global_i:
3542  case Intrinsic::nvvm_ldg_global_f:
3543  case Intrinsic::nvvm_ldg_global_p: {
3544  auto &DL = I.getModule()->getDataLayout();
3545 
3546  Info.opc = ISD::INTRINSIC_W_CHAIN;
3547  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3548  Info.memVT = getValueType(DL, I.getType());
3549  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3550  Info.memVT = getPointerTy(DL);
3551  else
3552  Info.memVT = getValueType(DL, I.getType());
3553  Info.ptrVal = I.getArgOperand(0);
3554  Info.offset = 0;
3555  Info.flags = MachineMemOperand::MOLoad;
3556  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3557 
3558  return true;
3559  }
3560 
3561  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3562  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3563  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3564  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3565  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3566  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3567  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3568  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3569  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3570  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3571  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3572  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3573  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3574  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3575  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3576  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3577  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3578  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3579  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3580  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3581  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3582  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3583  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3584  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3585  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3586  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3587  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3588  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3589  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3590  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3591  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3592  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3593  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3594  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3595  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3596  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3597  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3598  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3599  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3600  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3601  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3602  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3603  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3604  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3605  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3606  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3607  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3608  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3609  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3610  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3611  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3612  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3613  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3614  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3615  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3616  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3617  Info.opc = getOpcForTextureInstr(Intrinsic);
3618  Info.memVT = MVT::v4f32;
3619  Info.ptrVal = nullptr;
3620  Info.offset = 0;
3621  Info.flags = MachineMemOperand::MOLoad;
3622  Info.align = 16;
3623  return true;
3624 
3625  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3626  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3627  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3628  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3629  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3630  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3631  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3632  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3633  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3634  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3635  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3636  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3637  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3638  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3639  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3640  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3641  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3642  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3643  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3644  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3645  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3646  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3647  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3648  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3649  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3650  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3651  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3652  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3653  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3654  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3655  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3656  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3657  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3658  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3659  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3660  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3661  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3662  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3663  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3664  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3665  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3666  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3667  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3668  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3669  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3670  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3671  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3672  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3673  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3674  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3675  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3676  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3677  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3678  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3679  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3680  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3681  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3682  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3683  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3684  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3685  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3686  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3687  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3688  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3689  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3690  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3691  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3692  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3693  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3694  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3695  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3696  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3697  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3698  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3699  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3700  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3701  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3702  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3703  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3704  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3705  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3706  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3707  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3708  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3709  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3710  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3711  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3712  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3713  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3714  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3715  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3716  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3717  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3718  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3719  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3720  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3721  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3722  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3723  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3724  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3725  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3726  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3727  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3728  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3729  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3730  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3731  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3732  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3733  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3734  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3735  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3736  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3737  Info.opc = getOpcForTextureInstr(Intrinsic);
3738  Info.memVT = MVT::v4i32;
3739  Info.ptrVal = nullptr;
3740  Info.offset = 0;
3741  Info.flags = MachineMemOperand::MOLoad;
3742  Info.align = 16;
3743  return true;
3744 
3745  case Intrinsic::nvvm_suld_1d_i8_clamp:
3746  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3747  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3748  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3749  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3750  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3751  case Intrinsic::nvvm_suld_2d_i8_clamp:
3752  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3753  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3754  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3755  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3756  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3757  case Intrinsic::nvvm_suld_3d_i8_clamp:
3758  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3759  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3760  case Intrinsic::nvvm_suld_1d_i8_trap:
3761  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3762  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3763  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3764  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3765  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3766  case Intrinsic::nvvm_suld_2d_i8_trap:
3767  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3768  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3769  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3770  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3771  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3772  case Intrinsic::nvvm_suld_3d_i8_trap:
3773  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3774  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3775  case Intrinsic::nvvm_suld_1d_i8_zero:
3776  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3777  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3778  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3779  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3780  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3781  case Intrinsic::nvvm_suld_2d_i8_zero:
3782  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3783  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3784  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3785  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3786  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3787  case Intrinsic::nvvm_suld_3d_i8_zero:
3788  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3789  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3790  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3791  Info.memVT = MVT::i8;
3792  Info.ptrVal = nullptr;
3793  Info.offset = 0;
3794  Info.flags = MachineMemOperand::MOLoad;
3795  Info.align = 16;
3796  return true;
3797 
3798  case Intrinsic::nvvm_suld_1d_i16_clamp:
3799  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3800  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3801  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3802  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3803  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3804  case Intrinsic::nvvm_suld_2d_i16_clamp:
3805  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3806  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3807  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3808  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3809  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3810  case Intrinsic::nvvm_suld_3d_i16_clamp:
3811  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3812  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3813  case Intrinsic::nvvm_suld_1d_i16_trap:
3814  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3815  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3816  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3817  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3818  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3819  case Intrinsic::nvvm_suld_2d_i16_trap:
3820  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3821  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3822  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3823  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3824  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3825  case Intrinsic::nvvm_suld_3d_i16_trap:
3826  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3827  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3828  case Intrinsic::nvvm_suld_1d_i16_zero:
3829  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3830  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3831  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3832  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3833  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3834  case Intrinsic::nvvm_suld_2d_i16_zero:
3835  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3836  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3837  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3838  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3839  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3840  case Intrinsic::nvvm_suld_3d_i16_zero:
3841  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3842  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3843  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3844  Info.memVT = MVT::i16;
3845  Info.ptrVal = nullptr;
3846  Info.offset = 0;
3847  Info.flags = MachineMemOperand::MOLoad;
3848  Info.align = 16;
3849  return true;
3850 
3851  case Intrinsic::nvvm_suld_1d_i32_clamp:
3852  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3853  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3854  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3855  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3856  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3857  case Intrinsic::nvvm_suld_2d_i32_clamp:
3858  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3859  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3860  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3861  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3862  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3863  case Intrinsic::nvvm_suld_3d_i32_clamp:
3864  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3865  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3866  case Intrinsic::nvvm_suld_1d_i32_trap:
3867  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3868  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3869  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3870  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3871  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3872  case Intrinsic::nvvm_suld_2d_i32_trap:
3873  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3874  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3875  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3876  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3877  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3878  case Intrinsic::nvvm_suld_3d_i32_trap:
3879  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3880  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3881  case Intrinsic::nvvm_suld_1d_i32_zero:
3882  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3883  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3884  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3885  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3886  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3887  case Intrinsic::nvvm_suld_2d_i32_zero:
3888  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3889  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3890  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3891  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3892  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3893  case Intrinsic::nvvm_suld_3d_i32_zero:
3894  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3895  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3896  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3897  Info.memVT = MVT::i32;
3898  Info.ptrVal = nullptr;
3899  Info.offset = 0;
3900  Info.flags = MachineMemOperand::MOLoad;
3901  Info.align = 16;
3902  return true;
3903 
3904  case Intrinsic::nvvm_suld_1d_i64_clamp:
3905  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3906  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3907  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3908  case Intrinsic::nvvm_suld_2d_i64_clamp:
3909  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3910  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3911  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3912  case Intrinsic::nvvm_suld_3d_i64_clamp:
3913  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3914  case Intrinsic::nvvm_suld_1d_i64_trap:
3915  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3916  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3917  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3918  case Intrinsic::nvvm_suld_2d_i64_trap:
3919  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3920  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3921  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3922  case Intrinsic::nvvm_suld_3d_i64_trap:
3923  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3924  case Intrinsic::nvvm_suld_1d_i64_zero:
3925  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3926  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3927  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3928  case Intrinsic::nvvm_suld_2d_i64_zero:
3929  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3930  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3931  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3932  case Intrinsic::nvvm_suld_3d_i64_zero:
3933  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3934  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3935  Info.memVT = MVT::i64;
3936  Info.ptrVal = nullptr;
3937  Info.offset = 0;
3938  Info.flags = MachineMemOperand::MOLoad;
3939  Info.align = 16;
3940  return true;
3941  }
3942  return false;
3943 }
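// Example (sketch; IR types abbreviated, not taken from the file above): for a
// WMMA fragment load such as
//   %frag = call ... @llvm.nvvm.wmma.m16n16k16.load.a.f16.col(i8* %p, ...)
// the hook above fills IntrinsicInfo with opc = ISD::INTRINSIC_W_CHAIN,
// memVT = MVT::v8f16, ptrVal = %p, flags = MOLoad and align = 16, so the
// selection DAG treats the intrinsic as a 16-byte aligned memory read.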
3944 
3945 /// isLegalAddressingMode - Return true if the addressing mode represented
3946 /// by AM is legal for this target, for a load/store of the specified type.
3947 /// Used to guide target specific optimizations, like loop strength reduction
3948 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
3949 /// (CodeGenPrepare.cpp)
3950 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
3951  const AddrMode &AM, Type *Ty,
3952  unsigned AS, Instruction *I) const {
3953  // AddrMode - This represents an addressing mode of:
3954  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
3955  //
3956  // The legal address modes are
3957  // - [avar]
3958  // - [areg]
3959  // - [areg+immoff]
3960  // - [immAddr]
3961 
3962  if (AM.BaseGV) {
3963  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
3964  }
3965 
3966  switch (AM.Scale) {
3967  case 0: // "r", "r+i" or "i" is allowed
3968  break;
3969  case 1:
3970  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
3971  return false;
3972  // Otherwise we have r+i.
3973  break;
3974  default:
3975  // No scale > 1 is allowed
3976  return false;
3977  }
3978  return true;
3979 }
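// Example (sketch) of how the rule above classifies candidate address modes:
//   [globalvar]    (BaseGV only)                      -> legal
//   [%r]           (base register)                    -> legal
//   [%r + 16]      (base register + immediate)        -> legal
//   [%r + %s]      (Scale == 1 with a base register)  -> not legal
//   [2 * %r + 8]   (Scale > 1)                        -> not legal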
3980 
3981 //===----------------------------------------------------------------------===//
3982 // NVPTX Inline Assembly Support
3983 //===----------------------------------------------------------------------===//
3984 
3985 /// getConstraintType - Given a constraint letter, return the type of
3986 /// constraint it is for this target.
3987 NVPTXTargetLowering::ConstraintType
3988 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
3989  if (Constraint.size() == 1) {
3990  switch (Constraint[0]) {
3991  default:
3992  break;
3993  case 'b':
3994  case 'r':
3995  case 'h':
3996  case 'c':
3997  case 'l':
3998  case 'f':
3999  case 'd':
4000  case '0':
4001  case 'N':
4002  return C_RegisterClass;
4003  }
4004  }
4005  return TargetLowering::getConstraintType(Constraint);
4006 }
4007 
4008 std::pair<unsigned, const TargetRegisterClass *>
4009 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4010  StringRef Constraint,
4011  MVT VT) const {
4012  if (Constraint.size() == 1) {
4013  switch (Constraint[0]) {
4014  case 'b':
4015  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4016  case 'c':
4017  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4018  case 'h':
4019  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4020  case 'r':
4021  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4022  case 'l':
4023  case 'N':
4024  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4025  case 'f':
4026  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4027  case 'd':
4028  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4029  }
4030  }
4031  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4032 }
4033 
4034 //===----------------------------------------------------------------------===//
4035 // NVPTX DAG Combining
4036 //===----------------------------------------------------------------------===//
4037 
4038 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4039  CodeGenOpt::Level OptLevel) const {
4040  // Always honor command-line argument
4041  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4042  return FMAContractLevelOpt > 0;
4043 
4044  // Do not contract if we're not optimizing the code.
4045  if (OptLevel == 0)
4046  return false;
4047 
4048  // Honor TargetOptions flags that explicitly say fusion is okay.
4049  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4050  return true;
4051 
4052  return allowUnsafeFPMath(MF);
4053 }
4054 
4055 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4056  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4057  if (MF.getTarget().Options.UnsafeFPMath)
4058  return true;
4059 
4060  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4061  const Function &F = MF.getFunction();
4062  if (F.hasFnAttribute("unsafe-fp-math")) {
4063  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4064  StringRef Val = Attr.getValueAsString();
4065  if (Val == "true")
4066  return true;
4067  }
4068 
4069  return false;
4070 }
4071 
4072 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4073 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4074 /// called with the default operands, and if that fails, with commuted
4075 /// operands.
4076 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4077  TargetLowering::DAGCombinerInfo &DCI,
4078  const NVPTXSubtarget &Subtarget,
4079  CodeGenOpt::Level OptLevel) {
4080  SelectionDAG &DAG = DCI.DAG;
4081  // Skip non-integer, non-scalar case
4082  EVT VT=N0.getValueType();
4083  if (VT.isVector())
4084  return SDValue();
4085 
4086  // fold (add (mul a, b), c) -> (mad a, b, c)
4087  //
4088  if (N0.getOpcode() == ISD::MUL) {
4089  assert (VT.isInteger());
4090  // For integer:
4091  // Since integer multiply-add costs the same as integer multiply
4092  // but is more costly than integer add, do the fusion only when
4093  // the mul is only used in the add.
4094  if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4095  !N0.getNode()->hasOneUse())
4096  return SDValue();
4097 
4098  // Do the folding
4099  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4100  N0.getOperand(0), N0.getOperand(1), N1);
4101  }
4102  else if (N0.getOpcode() == ISD::FMUL) {
4103  if (VT == MVT::f32 || VT == MVT::f64) {
4104  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4105  &DAG.getTargetLoweringInfo());
4106  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4107  return SDValue();
4108 
4109  // For floating point:
4110  // Do the fusion only when the mul has fewer than 5 uses and all of
4111  // them are adds.
4112  // The heuristic is that if a use is not an add, then that use
4113  // cannot be fused into an fma, so the mul is still needed anyway.
4114  // If there are more than 4 uses, even if they are all adds, fusing
4115  // them will increase register pressure.
4116  //
4117  int numUses = 0;
4118  int nonAddCount = 0;
4119  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4120  UE = N0.getNode()->use_end();
4121  UI != UE; ++UI) {
4122  numUses++;
4123  SDNode *User = *UI;
4124  if (User->getOpcode() != ISD::FADD)
4125  ++nonAddCount;
4126  }
4127  if (numUses >= 5)
4128  return SDValue();
4129  if (nonAddCount) {
4130  int orderNo = N->getIROrder();
4131  int orderNo2 = N0.getNode()->getIROrder();
4132  // Simple heuristic for estimating potential register pressure:
4133  // the difference in IR order is used to measure the distance
4134  // between the def and the use; the longer the distance, the more
4135  // likely it is to cause register pressure.
4136  if (orderNo - orderNo2 < 500)
4137  return SDValue();
4138 
4139  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4140  // which guarantees that the FMA will not increase register pressure at node N.
4141  bool opIsLive = false;
4142  const SDNode *left = N0.getOperand(0).getNode();
4143  const SDNode *right = N0.getOperand(1).getNode();
4144 
4145  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4146  opIsLive = true;
4147 
4148  if (!opIsLive)
4149  for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4150  SDNode *User = *UI;
4151  int orderNo3 = User->getIROrder();
4152  if (orderNo3 > orderNo) {
4153  opIsLive = true;
4154  break;
4155  }
4156  }
4157 
4158  if (!opIsLive)
4159  for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4160  SDNode *User = *UI;
4161  int orderNo3 = User->getIROrder();
4162  if (orderNo3 > orderNo) {
4163  opIsLive = true;
4164  break;
4165  }
4166  }
4167 
4168  if (!opIsLive)
4169  return SDValue();
4170  }
4171 
4172  return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4173  N0.getOperand(0), N0.getOperand(1), N1);
4174  }
4175  }
4176 
4177  return SDValue();
4178 }
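// Example (sketch) of the two folds above:
//   integer: (add (mul i32 %a, %b), %c)   -> (NVPTXISD::IMAD %a, %b, %c),
//            selected as mad.lo.s32, done only when the mul has one use.
//   float:   (fadd (fmul f32 %a, %b), %c) -> (fma f32 %a, %b, %c),
//            subject to allowFMA() and the register-pressure heuristic.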
4179 
4180 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4181 ///
4182 static SDValue PerformADDCombine(SDNode *N,
4183  TargetLowering::DAGCombinerInfo &DCI,
4184  const NVPTXSubtarget &Subtarget,
4185  CodeGenOpt::Level OptLevel) {
4186  SDValue N0 = N->getOperand(0);
4187  SDValue N1 = N->getOperand(1);
4188 
4189  // First try with the default operand order.
4190  if (SDValue Result =
4191  PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4192  return Result;
4193 
4194  // If that didn't work, try again with the operands commuted.
4195  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4196 }
4197 
4198 static SDValue PerformANDCombine(SDNode *N,
4199  TargetLowering::DAGCombinerInfo &DCI) {
4200  // The type legalizer turns a vector load of i8 values into a zextload to i16
4201  // registers, optionally ANY_EXTENDs it (if target type is integer),
4202  // and ANDs off the high 8 bits. Since we turn this load into a
4203  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4204  // nodes. Do that here.
4205  SDValue Val = N->getOperand(0);
4206  SDValue Mask = N->getOperand(1);
4207 
4208  if (isa<ConstantSDNode>(Val)) {
4209  std::swap(Val, Mask);
4210  }
4211 
4212  SDValue AExt;
4213  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4214  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4215  AExt = Val;
4216  Val = Val->getOperand(0);
4217  }
4218 
4219  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4220  Val = Val->getOperand(0);
4221  }
4222 
4223  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4224  Val->getOpcode() == NVPTXISD::LoadV4) {
4225  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4226  if (!MaskCnst) {
4227  // Not an AND with a constant
4228  return SDValue();
4229  }
4230 
4231  uint64_t MaskVal = MaskCnst->getZExtValue();
4232  if (MaskVal != 0xff) {
4233  // Not an AND that chops off top 8 bits
4234  return SDValue();
4235  }
4236 
4237  MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4238  if (!Mem) {
4239  // Not a MemSDNode?!?
4240  return SDValue();
4241  }
4242 
4243  EVT MemVT = Mem->getMemoryVT();
4244  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4245  // We only handle the i8 case
4246  return SDValue();
4247  }
4248 
4249  unsigned ExtType =
4250  cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4251  getZExtValue();
4252  if (ExtType == ISD::SEXTLOAD) {
4253  // If for some reason the load is a sextload, the and is needed to zero
4254  // out the high 8 bits
4255  return SDValue();
4256  }
4257 
4258  bool AddTo = false;
4259  if (AExt.getNode() != nullptr) {
4260  // Re-insert the ext as a zext.
4261  Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4262  AExt.getValueType(), Val);
4263  AddTo = true;
4264  }
4265 
4266  // If we get here, the AND is unnecessary. Just replace it with the load
4267  DCI.CombineTo(N, Val, AddTo);
4268  }
4269 
4270  return SDValue();
4271 }
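// Example (sketch) of the pattern cleaned up above: after legalizing a
// <4 x i8> load, the DAG may contain
//   (and (any_extend (NVPTXISD::LoadV4 ..., zextload v4i8)), 255)
// Because the elements were already zero-extended by the load, the AND with
// 0xff is redundant; the load result is used directly, with the extension
// re-inserted as a zero_extend when one was present.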
4272 
4273 static SDValue PerformREMCombine(SDNode *N,
4274  TargetLowering::DAGCombinerInfo &DCI,
4275  CodeGenOpt::Level OptLevel) {
4276  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4277 
4278  // Don't do anything at less than -O2.
4279  if (OptLevel < CodeGenOpt::Default)
4280  return SDValue();
4281 
4282  SelectionDAG &DAG = DCI.DAG;
4283  SDLoc DL(N);
4284  EVT VT = N->getValueType(0);
4285  bool IsSigned = N->getOpcode() == ISD::SREM;
4286  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4287 
4288  const SDValue &Num = N->getOperand(0);
4289  const SDValue &Den = N->getOperand(1);
4290 
4291  for (const SDNode *U : Num->uses()) {
4292  if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4293  U->getOperand(1) == Den) {
4294  // Num % Den -> Num - (Num / Den) * Den
4295  return DAG.getNode(ISD::SUB, DL, VT, Num,
4296  DAG.getNode(ISD::MUL, DL, VT,
4297  DAG.getNode(DivOpc, DL, VT, Num, Den),
4298  Den));
4299  }
4300  }
4301  return SDValue();
4302 }
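// Worked example (sketch): when both the quotient and the remainder of the
// same operands are live,
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d
// the urem is rewritten as %r = sub i32 %n, (mul i32 (udiv i32 %n, %d), %d);
// the rebuilt udiv CSEs with the existing one, so only one division remains.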
4303 
4304 enum OperandSignedness {
4305  Signed = 0,
4306  Unsigned,
4307  Unknown
4308 };
4309 
4310 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4311 /// that can be demoted to \p OptSize bits without loss of information. The
4312 /// signedness of the operand, if determinable, is placed in \p S.
4313 static bool IsMulWideOperandDemotable(SDValue Op,
4314  unsigned OptSize,
4315  OperandSignedness &S) {
4316  S = Unknown;
4317 
4318  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4319  Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4320  EVT OrigVT = Op.getOperand(0).getValueType();
4321  if (OrigVT.getSizeInBits() <= OptSize) {
4322  S = Signed;
4323  return true;
4324  }
4325  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4326  EVT OrigVT = Op.getOperand(0).getValueType();
4327  if (OrigVT.getSizeInBits() <= OptSize) {
4328  S = Unsigned;
4329  return true;
4330  }
4331  }
4332 
4333  return false;
4334 }
4335 
4336 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4337 /// be demoted to \p OptSize bits without loss of information. If the operands
4338 /// contain a constant, it should appear as the RHS operand. The signedness of
4339 /// the operands is placed in \p IsSigned.
4340 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4341  unsigned OptSize,
4342  bool &IsSigned) {
4343  OperandSignedness LHSSign;
4344 
4345  // The LHS operand must be a demotable op
4346  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4347  return false;
4348 
4349  // We should have been able to determine the signedness from the LHS
4350  if (LHSSign == Unknown)
4351  return false;
4352 
4353  IsSigned = (LHSSign == Signed);
4354 
4355  // The RHS can be a demotable op or a constant
4356  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4357  const APInt &Val = CI->getAPIntValue();
4358  if (LHSSign == Unsigned) {
4359  return Val.isIntN(OptSize);
4360  } else {
4361  return Val.isSignedIntN(OptSize);
4362  }
4363  } else {
4364  OperandSignedness RHSSign;
4365  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4366  return false;
4367 
4368  return LHSSign == RHSSign;
4369  }
4370 }
4371 
4372 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4373 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4374 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4375 /// amount.
4376 static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
4377  EVT MulType = N->getValueType(0);
4379  if (MulType != MVT::i32 && MulType != MVT::i64) {
4380  return SDValue();
4381  }
4382 
4383  SDLoc DL(N);
4384  unsigned OptSize = MulType.getSizeInBits() >> 1;
4385  SDValue LHS = N->getOperand(0);
4386  SDValue RHS = N->getOperand(1);
4387 
4388  // Canonicalize the multiply so the constant (if any) is on the right
4389  if (N->getOpcode() == ISD::MUL) {
4390  if (isa<ConstantSDNode>(LHS)) {
4391  std::swap(LHS, RHS);
4392  }
4393  }
4394 
4395  // If we have a SHL, determine the actual multiply amount
4396  if (N->getOpcode() == ISD::SHL) {
4397  ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4398  if (!ShlRHS) {
4399  return SDValue();
4400  }
4401 
4402  APInt ShiftAmt = ShlRHS->getAPIntValue();
4403  unsigned BitWidth = MulType.getSizeInBits();
4404  if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4405  APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4406  RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4407  } else {
4408  return SDValue();
4409  }
4410  }
4411 
4412  bool Signed;
4413  // Verify that our operands are demotable
4414  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4415  return SDValue();
4416  }
4417 
4418  EVT DemotedVT;
4419  if (MulType == MVT::i32) {
4420  DemotedVT = MVT::i16;
4421  } else {
4422  DemotedVT = MVT::i32;
4423  }
4424 
4425  // Truncate the operands to the correct size. Note that these are just for
4426  // type consistency and will (likely) be eliminated in later phases.
4427  SDValue TruncLHS =
4428  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4429  SDValue TruncRHS =
4430  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4431 
4432  unsigned Opc;
4433  if (Signed) {
4434  Opc = NVPTXISD::MUL_WIDE_SIGNED;
4435  } else {
4436  Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4437  }
4438 
4439  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4440 }
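// Example (sketch): a 32-bit multiply whose operands are sign extensions from
// i16, e.g. (mul (sext i16 %a), (sext i16 %b)), is demoted to
// (NVPTXISD::MUL_WIDE_SIGNED i16 %a, i16 %b), i.e. PTX mul.wide.s16, which
// produces the full 32-bit product directly from the 16-bit sources. The same
// applies to i64 results from i32 sources and to shl-by-constant forms.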
4441 
4442 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4443 static SDValue PerformMULCombine(SDNode *N,
4444  TargetLowering::DAGCombinerInfo &DCI,
4445  CodeGenOpt::Level OptLevel) {
4446  if (OptLevel > 0) {
4447  // Try mul.wide combining at OptLevel > 0
4448  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4449  return Ret;
4450  }
4451 
4452  return SDValue();
4453 }
4454 
4455 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4456 static SDValue PerformSHLCombine(SDNode *N,
4457  TargetLowering::DAGCombinerInfo &DCI,
4458  CodeGenOpt::Level OptLevel) {
4459  if (OptLevel > 0) {
4460  // Try mul.wide combining at OptLevel > 0
4461  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4462  return Ret;
4463  }
4464 
4465  return SDValue();
4466 }
4467 
4468 static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
4469  EVT CCType = N->getValueType(0);
4471  SDValue A = N->getOperand(0);
4472  SDValue B = N->getOperand(1);
4473 
4474  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4475  return SDValue();
4476 
4477  SDLoc DL(N);
4478  // setp.f16x2 returns two scalar predicates, which we need to
4479  // convert back to v2i1. The returned result will be scalarized by
4480  // the legalizer, but the comparison will remain a single vector
4481  // instruction.
4482  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4483  DCI.DAG.getVTList(MVT::i1, MVT::i1),
4484  {A, B, N->getOperand(2)});
4485  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4486  CCNode.getValue(1));
4487 }
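// Example (sketch): (setcc v2i1 (v2f16 %a), (v2f16 %b), setolt) becomes one
// NVPTXISD::SETP_F16X2 node producing two i1 results, which are repackaged
// with BUILD_VECTOR; the comparison stays a single f16x2 setp instruction
// instead of two scalar compares.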
4488 
4489 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4490  DAGCombinerInfo &DCI) const {
4491  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4492  switch (N->getOpcode()) {
4493  default: break;
4494  case ISD::ADD:
4495  case ISD::FADD:
4496  return PerformADDCombine(N, DCI, STI, OptLevel);
4497  case ISD::MUL:
4498  return PerformMULCombine(N, DCI, OptLevel);
4499  case ISD::SHL:
4500  return PerformSHLCombine(N, DCI, OptLevel);
4501  case ISD::AND:
4502  return PerformANDCombine(N, DCI);
4503  case ISD::UREM:
4504  case ISD::SREM:
4505  return PerformREMCombine(N, DCI, OptLevel);
4506  case ISD::SETCC:
4507  return PerformSETCCCombine(N, DCI);
4508  }
4509  return SDValue();
4510 }
4511 
4512 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
4513 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4514  SmallVectorImpl<SDValue> &Results) {
4515  EVT ResVT = N->getValueType(0);
4516  SDLoc DL(N);
4517 
4518  assert(ResVT.isVector() && "Vector load must have vector type");
4519 
4520  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4521  // legal. We can (and should) split that into 2 loads of <2 x double> here
4522  // but I'm leaving that as a TODO for now.
4523  assert(ResVT.isSimple() && "Can only handle simple types");
4524  switch (ResVT.getSimpleVT().SimpleTy) {
4525  default:
4526  return;
4527  case MVT::v2i8:
4528  case MVT::v2i16:
4529  case MVT::v2i32:
4530  case MVT::v2i64:
4531  case MVT::v2f16:
4532  case MVT::v2f32:
4533  case MVT::v2f64:
4534  case MVT::v4i8:
4535  case MVT::v4i16:
4536  case MVT::v4i32:
4537  case MVT::v4f16:
4538  case MVT::v4f32:
4539  case MVT::v8f16: // <4 x f16x2>
4540  // This is a "native" vector type
4541  break;
4542  }
4543 
4544  LoadSDNode *LD = cast<LoadSDNode>(N);
4545 
4546  unsigned Align = LD->getAlignment();
4547  auto &TD = DAG.getDataLayout();
4548  unsigned PrefAlign =
4549  TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4550  if (Align < PrefAlign) {
4551  // This load is not sufficiently aligned, so bail out and let this vector
4552  // load be scalarized. Note that we may still be able to emit smaller
4553  // vector loads. For example, if we are loading a <4 x float> with an
4554  // alignment of 8, this check will fail but the legalizer will try again
4555  // with 2 x <2 x float>, which will succeed with an alignment of 8.
4556  return;
4557  }
4558 
4559  EVT EltVT = ResVT.getVectorElementType();
4560  unsigned NumElts = ResVT.getVectorNumElements();
4561 
4562  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4563  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4564  // loaded type to i16 and propagate the "real" type as the memory type.
4565  bool NeedTrunc = false;
4566  if (EltVT.getSizeInBits() < 16) {
4567  EltVT = MVT::i16;
4568  NeedTrunc = true;
4569  }
4570 
4571  unsigned Opcode = 0;
4572  SDVTList LdResVTs;
4573  bool LoadF16x2 = false;
4574 
4575  switch (NumElts) {
4576  default:
4577  return;
4578  case 2:
4579  Opcode = NVPTXISD::LoadV2;
4580  LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4581  break;
4582  case 4: {
4583  Opcode = NVPTXISD::LoadV4;
4584  EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4585  LdResVTs = DAG.getVTList(ListVTs);
4586  break;
4587  }
4588  case 8: {
4589  // v8f16 is a special case. PTX doesn't have ld.v8.f16
4590  // instruction. Instead, we split the vector into v2f16 chunks and
4591  // load them with ld.v4.b32.
4592  assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
4593  LoadF16x2 = true;
4594  Opcode = NVPTXISD::LoadV4;
4595  EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
4596  MVT::Other};
4597  LdResVTs = DAG.getVTList(ListVTs);
4598  break;
4599  }
4600  }
4601 
4602  // Copy regular operands
4603  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4604 
4605  // The select routine does not have access to the LoadSDNode instance, so
4606  // pass along the extension information
4607  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4608 
4609  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4610  LD->getMemoryVT(),
4611  LD->getMemOperand());
4612 
4613  SmallVector<SDValue, 8> ScalarRes;
4614  if (LoadF16x2) {
4615  // Split v2f16 subvectors back into individual elements.
4616  NumElts /= 2;
4617  for (unsigned i = 0; i < NumElts; ++i) {
4618  SDValue SubVector = NewLD.getValue(i);
4619  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4620  DAG.getIntPtrConstant(0, DL));
4621  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4622  DAG.getIntPtrConstant(1, DL));
4623  ScalarRes.push_back(E0);
4624  ScalarRes.push_back(E1);
4625  }
4626  } else {
4627  for (unsigned i = 0; i < NumElts; ++i) {
4628  SDValue Res = NewLD.getValue(i);
4629  if (NeedTrunc)
4630  Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4631  ScalarRes.push_back(Res);
4632  }
4633  }
4634 
4635  SDValue LoadChain = NewLD.getValue(NumElts);
4636 
4637  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4638 
4639  Results.push_back(BuildVec);
4640  Results.push_back(LoadChain);
4641 }
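// Example (sketch): a sufficiently aligned load of <4 x float> is replaced by
// one NVPTXISD::LoadV4 node with four f32 results plus a chain (ld.v4.f32 in
// PTX), and the vector value is rebuilt with BUILD_VECTOR. A <8 x half> load
// instead yields four v2f16 results (ld.v4.b32) that are split back into
// scalar halves here.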
4642 
4643 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4644  SmallVectorImpl<SDValue> &Results) {
4645  SDValue Chain = N->getOperand(0);
4646  SDValue Intrin = N->getOperand(1);
4647  SDLoc DL(N);
4648 
4649  // Get the intrinsic ID
4650  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4651  switch (IntrinNo) {
4652  default:
4653  return;
4654  case Intrinsic::nvvm_ldg_global_i:
4655  case Intrinsic::nvvm_ldg_global_f:
4656  case Intrinsic::nvvm_ldg_global_p:
4657  case Intrinsic::nvvm_ldu_global_i: