LLVM  10.0.0svn
NVPTXISelLowering.cpp
1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/Analysis.h"
32 #include "llvm/IR/Argument.h"
33 #include "llvm/IR/Attributes.h"
34 #include "llvm/IR/CallSite.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/GlobalValue.h"
40 #include "llvm/IR/Instruction.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/Support/Casting.h"
46 #include "llvm/Support/CodeGen.h"
54 #include <algorithm>
55 #include <cassert>
56 #include <cstdint>
57 #include <iterator>
58 #include <sstream>
59 #include <string>
60 #include <utility>
61 #include <vector>
62 
63 #define DEBUG_TYPE "nvptx-lower"
64 
65 using namespace llvm;
66 
67 static unsigned int uniqueCallSite = 0;
68 
70  "nvptx-sched4reg",
71  cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
72 
73 static cl::opt<unsigned>
74     FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
75                         cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
76                                  " 1: do it  2: do it aggressively"),
77                         cl::init(2));
78 
80  "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
81  cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
82  " IEEE Compliant F32 div.rnd if available."),
83  cl::init(2));
84 
86  "nvptx-prec-sqrtf32", cl::Hidden,
87  cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
88  cl::init(true));
89 
91  "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
92  cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
93  cl::init(false));
94 
95 int NVPTXTargetLowering::getDivF32Level() const {
96   if (UsePrecDivF32.getNumOccurrences() > 0) {
97     // If nvptx-prec-divf32=N is used on the command-line, always honor it
98     return UsePrecDivF32;
99   } else {
100     // Otherwise, use div.approx if fast math is enabled
101     if (getTargetMachine().Options.UnsafeFPMath)
102       return 0;
103     else
104       return 2;
105   }
106 }
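
// For illustration (not part of the original file): per the option
// description above, level 2 (the default) selects the IEEE-compliant
// div.rn form for f32 division, level 1 selects div.full.f32, and level 0
// (or fast math) selects the approximate div.approx.f32.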
107 
108 bool NVPTXTargetLowering::usePrecSqrtF32() const {
109   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
110     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
111     return UsePrecSqrtF32;
112   } else {
113     // Otherwise, use sqrt.approx if fast math is enabled
114     return !getTargetMachine().Options.UnsafeFPMath;
115   }
116 }
117 
118 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
119   // TODO: Get rid of this flag; there can be only one way to do this.
120  if (FtzEnabled.getNumOccurrences() > 0) {
121  // If nvptx-f32ftz is used on the command-line, always honor it
122  return FtzEnabled;
123  } else {
124  const Function &F = MF.getFunction();
125  // Otherwise, check for an nvptx-f32ftz attribute on the function
126  if (F.hasFnAttribute("nvptx-f32ftz"))
127  return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
128  else
129  return false;
130  }
131 }
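
// For illustration (hypothetical IR, not from this file): the attribute
// consulted above is a plain string function attribute, e.g.
//   define float @foo(float %x) #0 { ... }
//   attributes #0 = { "nvptx-f32ftz"="true" }
// It only takes effect when -nvptx-f32ftz is not given on the command line.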
132 
133 static bool IsPTXVectorType(MVT VT) {
134  switch (VT.SimpleTy) {
135  default:
136  return false;
137  case MVT::v2i1:
138  case MVT::v4i1:
139  case MVT::v2i8:
140  case MVT::v4i8:
141  case MVT::v2i16:
142  case MVT::v4i16:
143  case MVT::v2i32:
144  case MVT::v4i32:
145  case MVT::v2i64:
146  case MVT::v2f16:
147  case MVT::v4f16:
148  case MVT::v8f16: // <4 x f16x2>
149  case MVT::v2f32:
150  case MVT::v4f32:
151  case MVT::v2f64:
152  return true;
153  }
154 }
155 
156 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
157 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
158 /// into their primitive components.
159 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
160 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
161 /// LowerCall, and LowerReturn.
162 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
163  Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
165  uint64_t StartingOffset = 0) {
166  SmallVector<EVT, 16> TempVTs;
167  SmallVector<uint64_t, 16> TempOffsets;
168 
169  // Special case for i128 - decompose to (i64, i64)
170  if (Ty->isIntegerTy(128)) {
171  ValueVTs.push_back(EVT(MVT::i64));
172  ValueVTs.push_back(EVT(MVT::i64));
173 
174  if (Offsets) {
175  Offsets->push_back(StartingOffset + 0);
176  Offsets->push_back(StartingOffset + 8);
177  }
178 
179  return;
180  }
181 
182  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
183  if (StructType *STy = dyn_cast<StructType>(Ty)) {
184  auto const *SL = DL.getStructLayout(STy);
185  auto ElementNum = 0;
186  for(auto *EI : STy->elements()) {
187  ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
188  StartingOffset + SL->getElementOffset(ElementNum));
189  ++ElementNum;
190  }
191  return;
192  }
193 
194  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
195  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
196  EVT VT = TempVTs[i];
197  uint64_t Off = TempOffsets[i];
198  // Split vectors into individual elements, except for v2f16, which
199  // we will pass as a single scalar.
200  if (VT.isVector()) {
201  unsigned NumElts = VT.getVectorNumElements();
202  EVT EltVT = VT.getVectorElementType();
203  // Vectors with an even number of f16 elements will be passed to
204  // us as an array of v2f16 elements. We must match this so we
205  // stay in sync with Ins/Outs.
206  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
207  EltVT = MVT::v2f16;
208  NumElts /= 2;
209  }
210  for (unsigned j = 0; j != NumElts; ++j) {
211  ValueVTs.push_back(EltVT);
212  if (Offsets)
213  Offsets->push_back(Off + j * EltVT.getStoreSize());
214  }
215  } else {
216  ValueVTs.push_back(VT);
217  if (Offsets)
218  Offsets->push_back(Off);
219  }
220  }
221 }
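
// Worked example of the decomposition above (offsets in bytes):
//   i128        ->  ValueVTs = { i64, i64 },     Offsets = { 0, 8 }
//   <4 x half>  ->  ValueVTs = { v2f16, v2f16 }, Offsets = { 0, 4 }
// Scalars and odd-length f16 vectors fall through to the generic
// element-by-element handling in the loop above.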
222 
223 // Check whether we can merge loads/stores of some of the pieces of a
224 // flattened function parameter or return value into a single vector
225 // load/store.
226 //
227 // The flattened parameter is represented as a list of EVTs and
228 // offsets, and the whole structure is aligned to ParamAlignment. This
229 // function determines whether we can load/store pieces of the
230 // parameter starting at index Idx using a single vectorized op of
231 // size AccessSize. If so, it returns the number of param pieces
232 // covered by the vector op. Otherwise, it returns 1.
233 static unsigned CanMergeParamLoadStoresStartingAt(
234     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
235     const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
236  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
237 
238  // Can't vectorize if param alignment is not sufficient.
239  if (AccessSize > ParamAlignment)
240  return 1;
241  // Can't vectorize if offset is not aligned.
242  if (Offsets[Idx] & (AccessSize - 1))
243  return 1;
244 
245  EVT EltVT = ValueVTs[Idx];
246  unsigned EltSize = EltVT.getStoreSize();
247 
248  // Element is too large to vectorize.
249  if (EltSize >= AccessSize)
250  return 1;
251 
252  unsigned NumElts = AccessSize / EltSize;
253   // Can't vectorize if AccessSize is not a multiple of EltSize.
254  if (AccessSize != EltSize * NumElts)
255  return 1;
256 
257  // We don't have enough elements to vectorize.
258  if (Idx + NumElts > ValueVTs.size())
259  return 1;
260 
261  // PTX ISA can only deal with 2- and 4-element vector ops.
262  if (NumElts != 4 && NumElts != 2)
263  return 1;
264 
265  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
266  // Types do not match.
267  if (ValueVTs[j] != EltVT)
268  return 1;
269 
270  // Elements are not contiguous.
271  if (Offsets[j] - Offsets[j - 1] != EltSize)
272  return 1;
273  }
274   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
275  return NumElts;
276 }
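
// For illustration (hypothetical flattened parameter): with
//   ValueVTs = { f32, f32, f32, f32 }, Offsets = { 0, 4, 8, 12 },
//   ParamAlignment = 16,
// a query at Idx = 0 with AccessSize = 16 returns 4 (one 16-byte vector
// access covers all four pieces), while AccessSize = 8 at Idx = 0 returns 2.
// Any misaligned offset, gap, or type mismatch falls back to 1.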
277 
278 // Flags for tracking per-element vectorization state of loads/stores
279 // of a flattened function parameter or return value.
280 enum ParamVectorizationFlags {
281   PVF_INNER = 0x0, // Middle elements of a vector.
282   PVF_FIRST = 0x1, // First element of the vector.
283   PVF_LAST = 0x2,  // Last element of the vector.
284   // Scalar is effectively a 1-element vector.
285   PVF_SCALAR = PVF_FIRST | PVF_LAST
286 };
287 
288 // Computes whether and how we can vectorize the loads/stores of a
289 // flattened function parameter or return value.
290 //
291 // The flattened parameter is represented as the list of ValueVTs and
292 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
293 // of the same size as ValueVTs indicating how each piece should be
294 // loaded/stored (i.e. as a scalar, or as part of a vector
295 // load/store).
296 static SmallVector<ParamVectorizationFlags, 16>
297 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
298                      const SmallVectorImpl<uint64_t> &Offsets,
299                      unsigned ParamAlignment) {
300   // Set vector size to match ValueVTs and mark all elements as
301   // scalars by default.
302   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
303   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
304 
305  // Check what we can vectorize using 128/64/32-bit accesses.
306  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
307  // Skip elements we've already processed.
308  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
309  for (unsigned AccessSize : {16, 8, 4, 2}) {
310  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
311  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
312  // Mark vectorized elements.
313  switch (NumElts) {
314  default:
315  llvm_unreachable("Unexpected return value");
316  case 1:
317  // Can't vectorize using this size, try next smaller size.
318  continue;
319  case 2:
320  assert(I + 1 < E && "Not enough elements.");
321  VectorInfo[I] = PVF_FIRST;
322  VectorInfo[I + 1] = PVF_LAST;
323  I += 1;
324  break;
325  case 4:
326  assert(I + 3 < E && "Not enough elements.");
327  VectorInfo[I] = PVF_FIRST;
328  VectorInfo[I + 1] = PVF_INNER;
329  VectorInfo[I + 2] = PVF_INNER;
330  VectorInfo[I + 3] = PVF_LAST;
331  I += 3;
332  break;
333  }
334  // Break out of the inner loop because we've already succeeded
335  // using largest possible AccessSize.
336  break;
337  }
338  }
339  return VectorInfo;
340 }
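
// For illustration, continuing the hypothetical parameter above:
//   ValueVTs = { f32, f32, f32, f32 }, Offsets = { 0, 4, 8, 12 },
//   ParamAlignment = 16
// yields VectorInfo = { PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST },
// i.e. a single v4 load/store. With ParamAlignment = 8 the same pieces are
// instead paired: { PVF_FIRST, PVF_LAST, PVF_FIRST, PVF_LAST }.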
341 
342 // NVPTXTargetLowering Constructor.
343 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
344                                          const NVPTXSubtarget &STI)
345     : TargetLowering(TM), nvTM(&TM), STI(STI) {
346   // Always lower memset, memcpy, and memmove intrinsics to load/store
347   // instructions, rather than generating calls to memset, memcpy, or
348   // memmove.
349  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
350  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
351  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
352 
355 
356  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
357  // condition branches.
358  setJumpIsExpensive(true);
359 
360  // Wide divides are _very_ slow. Try to reduce the width of the divide if
361  // possible.
362  addBypassSlowDiv(64, 32);
363 
364  // By default, use the Source scheduling
365   if (sched4reg)
366     setSchedulingPreference(Sched::RegPressure);
367   else
368     setSchedulingPreference(Sched::Source);
369 
370  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
371  LegalizeAction NoF16Action) {
372  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
373  };
374 
375  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
376  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
377  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
378  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
379  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
380  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
381  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
382  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
383 
384  // Conversion to/from FP16/FP16x2 is always legal.
391 
392  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
393  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
394 
395  // Operations not directly supported by NVPTX.
400  }
401 
402  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
403  // For others we will expand to a SHL/SRA pair.
409 
416 
419 
420  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
421  // that don't have h/w rotation we lower them to multi-instruction assembly.
422  // See ROT*_sw in NVPTXIntrInfo.td
427 
435 
436  // Indirect branch is not supported.
437  // This also disables Jump Table creation.
440 
443 
444   // We want to legalize constant-related memmove and memcpy
445   // intrinsics.
447 
448  // Turn FP extload into load/fpextend
458  // Turn FP truncstore into trunc + store.
459  // FIXME: vector types should also be expanded
463 
464  // PTX does not support load / store predicate registers
467 
468  for (MVT VT : MVT::integer_valuetypes()) {
472  }
473 
474  // This is legal in NVPTX
478 
479  // TRAP can be lowered to PTX trap
481 
482  // Register custom handling for vector loads/stores
483  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
484  if (IsPTXVectorType(VT)) {
488  }
489  }
490 
491  // Custom handling for i8 intrinsics
493 
494  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
500 
503  }
504 
508 
509  // PTX does not directly support SELP of i1, so promote to i32 first
511 
512  // PTX cannot multiply two i64s in a single instruction.
515 
516  // We have some custom DAG combine patterns for these nodes
524 
525  // setcc for f16x2 needs special handling to prevent legalizer's
526  // attempt to scalarize it due to v2i1 not being legal.
527  if (STI.allowFP16Math())
529 
530  // Promote fp16 arithmetic if fp16 hardware isn't available or the
531  // user passed --nvptx-no-fp16-math. The flag is useful because,
532  // although sm_53+ GPUs have some sort of FP16 support in
533   // hardware, only sm_53 and sm_60 have a full implementation. Others
534   // have only a token amount of hardware and are likely to run faster
535  // by using fp32 units instead.
536  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
537  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
538  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
539  }
540 
541  // There's no neg.f16 instruction. Expand to (0-x).
544 
545  // (would be) Library functions.
546 
547  // These map to conversion instructions for scalar FP types.
548  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
549  ISD::FTRUNC}) {
554  }
555 
560 
561 
562  // 'Expand' implements FCOPYSIGN without calling an external library.
567 
568  // These map to corresponding instructions for f32/f64. f16 must be
569  // promoted to f32. v2f16 is expanded to f16, which is then promoted
570  // to f32.
571  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
577  }
582 
583  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
584  // No FPOW or FREM in PTX.
585 
586  // Now deduce the information based on the above mentioned
587   // actions.
588   computeRegisterProperties(STI.getRegisterInfo());
589 }
590 
591 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
592  switch ((NVPTXISD::NodeType)Opcode) {
593   case NVPTXISD::FIRST_NUMBER:
594     break;
595   case NVPTXISD::CALL:
596     return "NVPTXISD::CALL";
597   case NVPTXISD::RET_FLAG:
598     return "NVPTXISD::RET_FLAG";
599   case NVPTXISD::LOAD_PARAM:
600     return "NVPTXISD::LOAD_PARAM";
601   case NVPTXISD::Wrapper:
602     return "NVPTXISD::Wrapper";
603   case NVPTXISD::DeclareParam:
604     return "NVPTXISD::DeclareParam";
605   case NVPTXISD::DeclareScalarParam:
606     return "NVPTXISD::DeclareScalarParam";
607   case NVPTXISD::DeclareRet:
608     return "NVPTXISD::DeclareRet";
609   case NVPTXISD::DeclareScalarRet:
610     return "NVPTXISD::DeclareScalarRet";
611   case NVPTXISD::DeclareRetParam:
612     return "NVPTXISD::DeclareRetParam";
613   case NVPTXISD::PrintCall:
614     return "NVPTXISD::PrintCall";
615   case NVPTXISD::PrintConvergentCall:
616     return "NVPTXISD::PrintConvergentCall";
617   case NVPTXISD::PrintCallUni:
618     return "NVPTXISD::PrintCallUni";
619   case NVPTXISD::PrintConvergentCallUni:
620     return "NVPTXISD::PrintConvergentCallUni";
621   case NVPTXISD::LoadParam:
622     return "NVPTXISD::LoadParam";
623   case NVPTXISD::LoadParamV2:
624     return "NVPTXISD::LoadParamV2";
625   case NVPTXISD::LoadParamV4:
626     return "NVPTXISD::LoadParamV4";
627   case NVPTXISD::StoreParam:
628     return "NVPTXISD::StoreParam";
629   case NVPTXISD::StoreParamV2:
630     return "NVPTXISD::StoreParamV2";
631   case NVPTXISD::StoreParamV4:
632     return "NVPTXISD::StoreParamV4";
633   case NVPTXISD::StoreParamS32:
634     return "NVPTXISD::StoreParamS32";
635   case NVPTXISD::StoreParamU32:
636     return "NVPTXISD::StoreParamU32";
637   case NVPTXISD::CallArgBegin:
638     return "NVPTXISD::CallArgBegin";
639   case NVPTXISD::CallArg:
640     return "NVPTXISD::CallArg";
641   case NVPTXISD::LastCallArg:
642     return "NVPTXISD::LastCallArg";
643   case NVPTXISD::CallArgEnd:
644     return "NVPTXISD::CallArgEnd";
645   case NVPTXISD::CallVoid:
646     return "NVPTXISD::CallVoid";
647   case NVPTXISD::CallVal:
648     return "NVPTXISD::CallVal";
649   case NVPTXISD::CallSymbol:
650     return "NVPTXISD::CallSymbol";
651   case NVPTXISD::Prototype:
652     return "NVPTXISD::Prototype";
653   case NVPTXISD::MoveParam:
654     return "NVPTXISD::MoveParam";
655   case NVPTXISD::StoreRetval:
656     return "NVPTXISD::StoreRetval";
657   case NVPTXISD::StoreRetvalV2:
658     return "NVPTXISD::StoreRetvalV2";
659   case NVPTXISD::StoreRetvalV4:
660     return "NVPTXISD::StoreRetvalV4";
661   case NVPTXISD::PseudoUseParam:
662     return "NVPTXISD::PseudoUseParam";
663   case NVPTXISD::RETURN:
664     return "NVPTXISD::RETURN";
665   case NVPTXISD::CallSeqBegin:
666     return "NVPTXISD::CallSeqBegin";
667   case NVPTXISD::CallSeqEnd:
668     return "NVPTXISD::CallSeqEnd";
669   case NVPTXISD::CallPrototype:
670     return "NVPTXISD::CallPrototype";
671  case NVPTXISD::ProxyReg:
672  return "NVPTXISD::ProxyReg";
673  case NVPTXISD::LoadV2:
674  return "NVPTXISD::LoadV2";
675  case NVPTXISD::LoadV4:
676  return "NVPTXISD::LoadV4";
677  case NVPTXISD::LDGV2:
678  return "NVPTXISD::LDGV2";
679  case NVPTXISD::LDGV4:
680  return "NVPTXISD::LDGV4";
681  case NVPTXISD::LDUV2:
682  return "NVPTXISD::LDUV2";
683  case NVPTXISD::LDUV4:
684  return "NVPTXISD::LDUV4";
685  case NVPTXISD::StoreV2:
686  return "NVPTXISD::StoreV2";
687  case NVPTXISD::StoreV4:
688  return "NVPTXISD::StoreV4";
690  return "NVPTXISD::FUN_SHFL_CLAMP";
692  return "NVPTXISD::FUN_SHFR_CLAMP";
693  case NVPTXISD::IMAD:
694  return "NVPTXISD::IMAD";
696  return "NVPTXISD::SETP_F16X2";
697  case NVPTXISD::Dummy:
698  return "NVPTXISD::Dummy";
700  return "NVPTXISD::MUL_WIDE_SIGNED";
702  return "NVPTXISD::MUL_WIDE_UNSIGNED";
703  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
704  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
706  return "NVPTXISD::Tex1DFloatFloatLevel";
708  return "NVPTXISD::Tex1DFloatFloatGrad";
709  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
710  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
712  return "NVPTXISD::Tex1DS32FloatLevel";
714  return "NVPTXISD::Tex1DS32FloatGrad";
715  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
716  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
718  return "NVPTXISD::Tex1DU32FloatLevel";
720  return "NVPTXISD::Tex1DU32FloatGrad";
721  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
722  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
724  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
726  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
727  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
728  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
730  return "NVPTXISD::Tex1DArrayS32FloatLevel";
732  return "NVPTXISD::Tex1DArrayS32FloatGrad";
733  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
734  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
736  return "NVPTXISD::Tex1DArrayU32FloatLevel";
738  return "NVPTXISD::Tex1DArrayU32FloatGrad";
739  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
740  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
742  return "NVPTXISD::Tex2DFloatFloatLevel";
744  return "NVPTXISD::Tex2DFloatFloatGrad";
745  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
746  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
748  return "NVPTXISD::Tex2DS32FloatLevel";
750  return "NVPTXISD::Tex2DS32FloatGrad";
751  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
752  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
754  return "NVPTXISD::Tex2DU32FloatLevel";
756  return "NVPTXISD::Tex2DU32FloatGrad";
757  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
758  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
760  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
762  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
763  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
764  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
766  return "NVPTXISD::Tex2DArrayS32FloatLevel";
768  return "NVPTXISD::Tex2DArrayS32FloatGrad";
769  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
770  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
772  return "NVPTXISD::Tex2DArrayU32FloatLevel";
774  return "NVPTXISD::Tex2DArrayU32FloatGrad";
775  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
776  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
778  return "NVPTXISD::Tex3DFloatFloatLevel";
780  return "NVPTXISD::Tex3DFloatFloatGrad";
781  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
782  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
784  return "NVPTXISD::Tex3DS32FloatLevel";
786  return "NVPTXISD::Tex3DS32FloatGrad";
787  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
788  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
790  return "NVPTXISD::Tex3DU32FloatLevel";
792  return "NVPTXISD::Tex3DU32FloatGrad";
793  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
795  return "NVPTXISD::TexCubeFloatFloatLevel";
796  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
798  return "NVPTXISD::TexCubeS32FloatLevel";
799  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
801  return "NVPTXISD::TexCubeU32FloatLevel";
803  return "NVPTXISD::TexCubeArrayFloatFloat";
805  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
807  return "NVPTXISD::TexCubeArrayS32Float";
809  return "NVPTXISD::TexCubeArrayS32FloatLevel";
811  return "NVPTXISD::TexCubeArrayU32Float";
813  return "NVPTXISD::TexCubeArrayU32FloatLevel";
815  return "NVPTXISD::Tld4R2DFloatFloat";
817  return "NVPTXISD::Tld4G2DFloatFloat";
819  return "NVPTXISD::Tld4B2DFloatFloat";
821  return "NVPTXISD::Tld4A2DFloatFloat";
823  return "NVPTXISD::Tld4R2DS64Float";
825  return "NVPTXISD::Tld4G2DS64Float";
827  return "NVPTXISD::Tld4B2DS64Float";
829  return "NVPTXISD::Tld4A2DS64Float";
831  return "NVPTXISD::Tld4R2DU64Float";
833  return "NVPTXISD::Tld4G2DU64Float";
835  return "NVPTXISD::Tld4B2DU64Float";
837  return "NVPTXISD::Tld4A2DU64Float";
838 
840  return "NVPTXISD::TexUnified1DFloatS32";
842  return "NVPTXISD::TexUnified1DFloatFloat";
844  return "NVPTXISD::TexUnified1DFloatFloatLevel";
846  return "NVPTXISD::TexUnified1DFloatFloatGrad";
848  return "NVPTXISD::TexUnified1DS32S32";
850  return "NVPTXISD::TexUnified1DS32Float";
852  return "NVPTXISD::TexUnified1DS32FloatLevel";
854  return "NVPTXISD::TexUnified1DS32FloatGrad";
856  return "NVPTXISD::TexUnified1DU32S32";
858  return "NVPTXISD::TexUnified1DU32Float";
860  return "NVPTXISD::TexUnified1DU32FloatLevel";
862  return "NVPTXISD::TexUnified1DU32FloatGrad";
864  return "NVPTXISD::TexUnified1DArrayFloatS32";
866  return "NVPTXISD::TexUnified1DArrayFloatFloat";
868  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
870  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
872  return "NVPTXISD::TexUnified1DArrayS32S32";
874  return "NVPTXISD::TexUnified1DArrayS32Float";
876  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
878  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
880  return "NVPTXISD::TexUnified1DArrayU32S32";
882  return "NVPTXISD::TexUnified1DArrayU32Float";
884  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
886  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
888  return "NVPTXISD::TexUnified2DFloatS32";
890  return "NVPTXISD::TexUnified2DFloatFloat";
892  return "NVPTXISD::TexUnified2DFloatFloatLevel";
894  return "NVPTXISD::TexUnified2DFloatFloatGrad";
896  return "NVPTXISD::TexUnified2DS32S32";
898  return "NVPTXISD::TexUnified2DS32Float";
900  return "NVPTXISD::TexUnified2DS32FloatLevel";
902  return "NVPTXISD::TexUnified2DS32FloatGrad";
904  return "NVPTXISD::TexUnified2DU32S32";
906  return "NVPTXISD::TexUnified2DU32Float";
908  return "NVPTXISD::TexUnified2DU32FloatLevel";
910  return "NVPTXISD::TexUnified2DU32FloatGrad";
912  return "NVPTXISD::TexUnified2DArrayFloatS32";
914  return "NVPTXISD::TexUnified2DArrayFloatFloat";
916  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
918  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
920  return "NVPTXISD::TexUnified2DArrayS32S32";
922  return "NVPTXISD::TexUnified2DArrayS32Float";
924  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
926  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
928  return "NVPTXISD::TexUnified2DArrayU32S32";
930  return "NVPTXISD::TexUnified2DArrayU32Float";
932  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
934  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
936  return "NVPTXISD::TexUnified3DFloatS32";
938  return "NVPTXISD::TexUnified3DFloatFloat";
940  return "NVPTXISD::TexUnified3DFloatFloatLevel";
942  return "NVPTXISD::TexUnified3DFloatFloatGrad";
944  return "NVPTXISD::TexUnified3DS32S32";
946  return "NVPTXISD::TexUnified3DS32Float";
948  return "NVPTXISD::TexUnified3DS32FloatLevel";
950  return "NVPTXISD::TexUnified3DS32FloatGrad";
952  return "NVPTXISD::TexUnified3DU32S32";
954  return "NVPTXISD::TexUnified3DU32Float";
956  return "NVPTXISD::TexUnified3DU32FloatLevel";
958  return "NVPTXISD::TexUnified3DU32FloatGrad";
960  return "NVPTXISD::TexUnifiedCubeFloatFloat";
962  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
964  return "NVPTXISD::TexUnifiedCubeS32Float";
966  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
968  return "NVPTXISD::TexUnifiedCubeU32Float";
970  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
972  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
974  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
976  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
978  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
980  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
982  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
984  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
986  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
988  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
990  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
992  return "NVPTXISD::Tld4UnifiedR2DS64Float";
994  return "NVPTXISD::Tld4UnifiedG2DS64Float";
996  return "NVPTXISD::Tld4UnifiedB2DS64Float";
998  return "NVPTXISD::Tld4UnifiedA2DS64Float";
1000  return "NVPTXISD::Tld4UnifiedR2DU64Float";
1002  return "NVPTXISD::Tld4UnifiedG2DU64Float";
1004  return "NVPTXISD::Tld4UnifiedB2DU64Float";
1006  return "NVPTXISD::Tld4UnifiedA2DU64Float";
1007 
1008  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1009  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1010  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1011  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1012  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1013  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1014  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1015  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1016  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1017  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1018  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1019 
1020  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1021  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1022  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1023  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1024  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1025  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1026  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1027  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1028  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1029  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1030  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1031 
1032  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1033  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1034  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1035  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1036  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1037  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1038  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1039  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1040  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1041  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1042  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1043 
1044  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1045  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1046  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1047  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1048  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1049  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1050  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1051  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1052  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1053  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1054  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1055 
1056  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1057  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1058  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1059  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1060  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1061  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1062  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1063  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1064  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1065  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1066  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1067 
1068  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1069  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1070  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1071  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1072  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1073  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1074  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1075  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1076  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1077  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1078  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1079 
1080  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1081  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1082  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1083  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1084  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1085  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1086  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1087  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1088  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1089  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1090  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1091 
1092  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1093  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1094  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1095  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1096  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1097  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1098  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1099  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1100  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1101  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1102  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1103 
1104  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1105  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1106  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1107  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1108  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1109  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1110  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1111  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1112  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1113  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1114  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1115 
1116  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1117  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1118  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1119  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1120  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1121  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1122  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1123  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1124  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1125  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1126  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1127 
1128  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1129  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1130  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1131  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1132  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1133  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1134  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1135  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1136  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1137  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1138  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1139 
1140  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1141  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1142  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1143  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1144  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1145  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1146  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1147  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1148  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1149  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1150  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1151 
1152  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1153  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1154  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1155  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1156  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1157  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1158  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1159  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1160  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1161  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1162  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1163 
1164  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1165  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1166  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1167  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1168  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1169  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1170  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1171  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1172  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1173  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1174  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1175 
1176  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1177  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1178  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1179  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1180  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1181  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1182  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1183  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1184  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1185  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1186  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1187  }
1188  return nullptr;
1189 }
1190 
1191 TargetLoweringBase::LegalizeTypeAction
1192 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1193   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1194     return TypeSplitVector;
1195   if (VT == MVT::v2f16)
1196     return TypeLegal;
1197   return TargetLoweringBase::getPreferredVectorAction(VT);
1198 }
1199 
1200 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1201                                              int Enabled, int &ExtraSteps,
1202                                              bool &UseOneConst,
1203                                              bool Reciprocal) const {
1204  if (!(Enabled == ReciprocalEstimate::Enabled ||
1205  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1206  return SDValue();
1207 
1208  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1209  ExtraSteps = 0;
1210 
1211  SDLoc DL(Operand);
1212  EVT VT = Operand.getValueType();
1213  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1214 
1215  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1216  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1217  DAG.getConstant(IID, DL, MVT::i32), Operand);
1218  };
1219 
1220  // The sqrt and rsqrt refinement processes assume we always start out with an
1221  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1222  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1223  // any refinement, we must return a regular sqrt.
1224  if (Reciprocal || ExtraSteps > 0) {
1225  if (VT == MVT::f32)
1226  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1227  : Intrinsic::nvvm_rsqrt_approx_f);
1228  else if (VT == MVT::f64)
1229  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1230  else
1231  return SDValue();
1232  } else {
1233  if (VT == MVT::f32)
1234  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1235  : Intrinsic::nvvm_sqrt_approx_f);
1236  else {
1237  // There's no sqrt.approx.f64 instruction, so we emit
1238  // reciprocal(rsqrt(x)). This is faster than
1239  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1240  // x * rsqrt(x).)
1241  return DAG.getNode(
1242  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1243  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1244  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1245  }
1246  }
1247 }
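
// For illustration: when approximations are allowed, an f32 llvm.sqrt is
// emitted through the path above as a single sqrt.approx.f32 (or
// sqrt.approx.ftz.f32 in flush-to-zero mode), and an f64 square root becomes
// rcp.approx.ftz.f64(rsqrt.approx.f64(x)). Requesting a reciprocal, or any
// refinement steps, instead yields the rsqrt.approx form to refine from.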
1248 
1249 SDValue
1250 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1251   SDLoc dl(Op);
1252  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1253  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1254  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1255  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1256 }
1257 
1258 std::string NVPTXTargetLowering::getPrototype(
1259     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1260     const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
1261     ImmutableCallSite CS) const {
1262  auto PtrVT = getPointerTy(DL);
1263 
1264  bool isABI = (STI.getSmVersion() >= 20);
1265  assert(isABI && "Non-ABI compilation is not supported");
1266  if (!isABI)
1267  return "";
1268 
1269  std::stringstream O;
1270  O << "prototype_" << uniqueCallSite << " : .callprototype ";
1271 
1272  if (retTy->getTypeID() == Type::VoidTyID) {
1273  O << "()";
1274  } else {
1275  O << "(";
1276  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1277  unsigned size = 0;
1278  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1279  size = ITy->getBitWidth();
1280  } else {
1281  assert(retTy->isFloatingPointTy() &&
1282  "Floating point type expected here");
1283  size = retTy->getPrimitiveSizeInBits();
1284  }
1285  // PTX ABI requires all scalar return values to be at least 32
1286  // bits in size. fp16 normally uses .b16 as its storage type in
1287  // PTX, so its size must be adjusted here, too.
1288  if (size < 32)
1289  size = 32;
1290 
1291  O << ".param .b" << size << " _";
1292  } else if (isa<PointerType>(retTy)) {
1293  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1294  } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
1295  retTy->isIntegerTy(128)) {
1296  O << ".param .align " << retAlignment << " .b8 _["
1297  << DL.getTypeAllocSize(retTy) << "]";
1298  } else {
1299  llvm_unreachable("Unknown return type");
1300  }
1301  O << ") ";
1302  }
1303  O << "_ (";
1304 
1305  bool first = true;
1306 
1307  unsigned OIdx = 0;
1308  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1309  Type *Ty = Args[i].Ty;
1310  if (!first) {
1311  O << ", ";
1312  }
1313  first = false;
1314 
1315  if (!Outs[OIdx].Flags.isByVal()) {
1316  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1317  unsigned align = 0;
1318  const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1319  // +1 because index 0 is reserved for return type alignment
1320  if (!getAlign(*CallI, i + 1, align))
1321  align = DL.getABITypeAlignment(Ty);
1322  unsigned sz = DL.getTypeAllocSize(Ty);
1323  O << ".param .align " << align << " .b8 ";
1324  O << "_";
1325  O << "[" << sz << "]";
1326  // update the index for Outs
1327  SmallVector<EVT, 16> vtparts;
1328  ComputeValueVTs(*this, DL, Ty, vtparts);
1329  if (unsigned len = vtparts.size())
1330  OIdx += len - 1;
1331  continue;
1332  }
1333  // i8 types in IR will be i16 types in SDAG
1334  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1335  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1336  "type mismatch between callee prototype and arguments");
1337  // scalar type
1338  unsigned sz = 0;
1339  if (isa<IntegerType>(Ty)) {
1340  sz = cast<IntegerType>(Ty)->getBitWidth();
1341  if (sz < 32)
1342  sz = 32;
1343  } else if (isa<PointerType>(Ty)) {
1344  sz = PtrVT.getSizeInBits();
1345  } else if (Ty->isHalfTy())
1346  // PTX ABI requires all scalar parameters to be at least 32
1347  // bits in size. fp16 normally uses .b16 as its storage type
1348  // in PTX, so its size must be adjusted here, too.
1349  sz = 32;
1350  else
1351  sz = Ty->getPrimitiveSizeInBits();
1352  O << ".param .b" << sz << " ";
1353  O << "_";
1354  continue;
1355  }
1356  auto *PTy = dyn_cast<PointerType>(Ty);
1357  assert(PTy && "Param with byval attribute should be a pointer type");
1358  Type *ETy = PTy->getElementType();
1359 
1360  unsigned align = Outs[OIdx].Flags.getByValAlign();
1361  unsigned sz = DL.getTypeAllocSize(ETy);
1362  O << ".param .align " << align << " .b8 ";
1363  O << "_";
1364  O << "[" << sz << "]";
1365  }
1366  O << ");";
1367  return O.str();
1368 }
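
// For illustration (hypothetical callee): for a call to
//   float @f(i32, float*)
// on a 64-bit target this routine produces a string along the lines of
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// which is emitted verbatim ahead of indirect calls (see the CallPrototype
// node in LowerCall below).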
1369 
1370 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1371  ImmutableCallSite CS,
1372  Type *Ty, unsigned Idx,
1373  const DataLayout &DL) const {
1374  if (!CS) {
1375     // CallSite is zero, fall back to ABI type alignment
1376  return DL.getABITypeAlignment(Ty);
1377  }
1378 
1379  unsigned Align = 0;
1380  const Value *DirectCallee = CS.getCalledFunction();
1381 
1382  if (!DirectCallee) {
1383  // We don't have a direct function symbol, but that may be because of
1384  // constant cast instructions in the call.
1385  const Instruction *CalleeI = CS.getInstruction();
1386  assert(CalleeI && "Call target is not a function or derived value?");
1387 
1388  // With bitcast'd call targets, the instruction will be the call
1389  if (isa<CallInst>(CalleeI)) {
1390  // Check if we have call alignment metadata
1391  if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1392  return Align;
1393 
1394  const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1395  // Ignore any bitcast instructions
1396  while (isa<ConstantExpr>(CalleeV)) {
1397  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1398  if (!CE->isCast())
1399  break;
1400  // Look through the bitcast
1401  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1402  }
1403 
1404  // We have now looked past all of the bitcasts. Do we finally have a
1405  // Function?
1406  if (isa<Function>(CalleeV))
1407  DirectCallee = CalleeV;
1408  }
1409  }
1410 
1411  // Check for function alignment information if we found that the
1412  // ultimate target is a Function
1413  if (DirectCallee)
1414  if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1415  return Align;
1416 
1417  // Call is indirect or alignment information is not available, fall back to
1418  // the ABI type alignment
1419  return DL.getABITypeAlignment(Ty);
1420 }
1421 
1423  SmallVectorImpl<SDValue> &InVals) const {
1424  SelectionDAG &DAG = CLI.DAG;
1425  SDLoc dl = CLI.DL;
1427  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1429  SDValue Chain = CLI.Chain;
1430  SDValue Callee = CLI.Callee;
1431  bool &isTailCall = CLI.IsTailCall;
1432  ArgListTy &Args = CLI.getArgs();
1433  Type *RetTy = CLI.RetTy;
1434  ImmutableCallSite CS = CLI.CS;
1435  const DataLayout &DL = DAG.getDataLayout();
1436 
1437  bool isABI = (STI.getSmVersion() >= 20);
1438  assert(isABI && "Non-ABI compilation is not supported");
1439  if (!isABI)
1440  return Chain;
1441 
1442  SDValue tempChain = Chain;
1443  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1444  SDValue InFlag = Chain.getValue(1);
1445 
1446  unsigned paramCount = 0;
1447  // Args.size() and Outs.size() need not match.
1448  // Outs.size() will be larger
1449  // * if there is an aggregate argument with multiple fields (each field
1450  // showing up separately in Outs)
1451  // * if there is a vector argument with more than typical vector-length
1452  // elements (generally if more than 4) where each vector element is
1453  // individually present in Outs.
1454  // So a different index should be used for indexing into Outs/OutVals.
1455  // See similar issue in LowerFormalArguments.
1456  unsigned OIdx = 0;
1457   // Declare the .param or .reg spaces needed to pass values
1458   // to the function.
1459  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1460  EVT VT = Outs[OIdx].VT;
1461  Type *Ty = Args[i].Ty;
1462 
1463  if (!Outs[OIdx].Flags.isByVal()) {
1464       SmallVector<EVT, 16> VTs;
1465       SmallVector<uint64_t, 16> Offsets;
1466       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1467  unsigned ArgAlign =
1468  getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1469  unsigned AllocSize = DL.getTypeAllocSize(Ty);
1470  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1471  bool NeedAlign; // Does argument declaration specify alignment?
1472  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1473  // declare .param .align <align> .b8 .param<n>[<size>];
1474  SDValue DeclareParamOps[] = {
1475  Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1476  DAG.getConstant(paramCount, dl, MVT::i32),
1477  DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1478  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1479  DeclareParamOps);
1480  NeedAlign = true;
1481  } else {
1482  // declare .param .b<size> .param<n>;
1483  if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1484  // PTX ABI requires integral types to be at least 32 bits in
1485  // size. FP16 is loaded/stored using i16, so it's handled
1486  // here as well.
1487  AllocSize = 4;
1488  }
1489  SDValue DeclareScalarParamOps[] = {
1490  Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1491  DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1492  DAG.getConstant(0, dl, MVT::i32), InFlag};
1493  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1494  DeclareScalarParamOps);
1495  NeedAlign = false;
1496  }
1497  InFlag = Chain.getValue(1);
1498 
1499  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1500  // than 32-bits are sign extended or zero extended, depending on
1501  // whether they are signed or unsigned types. This case applies
1502  // only to scalar parameters and not to aggregate values.
1503  bool ExtendIntegerParam =
1504  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1505 
1506  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1507  SmallVector<SDValue, 6> StoreOperands;
1508  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1509  // New store.
1510  if (VectorInfo[j] & PVF_FIRST) {
1511  assert(StoreOperands.empty() && "Unfinished preceding store.");
1512  StoreOperands.push_back(Chain);
1513  StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1514  StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1515  }
1516 
1517  EVT EltVT = VTs[j];
1518  SDValue StVal = OutVals[OIdx];
1519  if (ExtendIntegerParam) {
1520  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1521  // zext/sext to i32
1522  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1523  : ISD::ZERO_EXTEND,
1524  dl, MVT::i32, StVal);
1525  } else if (EltVT.getSizeInBits() < 16) {
1526  // Use 16-bit registers for small stores as it's the
1527  // smallest general purpose register size supported by NVPTX.
1528  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1529  }
1530 
1531  // Record the value to store.
1532  StoreOperands.push_back(StVal);
1533 
1534  if (VectorInfo[j] & PVF_LAST) {
1535  unsigned NumElts = StoreOperands.size() - 3;
1536           NVPTXISD::NodeType Op;
1537           switch (NumElts) {
1538           case 1:
1539             Op = NVPTXISD::StoreParam;
1540             break;
1541           case 2:
1542             Op = NVPTXISD::StoreParamV2;
1543             break;
1544           case 4:
1545             Op = NVPTXISD::StoreParamV4;
1546             break;
1547  default:
1548  llvm_unreachable("Invalid vector info.");
1549  }
1550 
1551  StoreOperands.push_back(InFlag);
1552 
1553  // Adjust type of the store op if we've extended the scalar
1554  // return value.
1555  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1556  unsigned EltAlign =
1557  NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1558 
1559  Chain = DAG.getMemIntrinsicNode(
1560  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1561  TheStoreType, MachinePointerInfo(), EltAlign,
1562               MachineMemOperand::MOStore);
1563           InFlag = Chain.getValue(1);
1564 
1565  // Cleanup.
1566  StoreOperands.clear();
1567  }
1568  ++OIdx;
1569  }
1570  assert(StoreOperands.empty() && "Unfinished parameter store.");
1571  if (VTs.size() > 0)
1572  --OIdx;
1573  ++paramCount;
1574  continue;
1575  }
1576 
1577  // ByVal arguments
1580  auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1581  assert(PTy && "Type of a byval parameter should be pointer");
1582  ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1583 
1584  // declare .param .align <align> .b8 .param<n>[<size>];
1585  unsigned sz = Outs[OIdx].Flags.getByValSize();
1586  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1587  unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1588     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1589  // so we don't need to worry about natural alignment or not.
1590  // See TargetLowering::LowerCallTo().
1591 
1592     // Enforce minimum alignment of 4 to work around a ptxas miscompile
1593  // for sm_50+. See corresponding alignment adjustment in
1594  // emitFunctionParamList() for details.
1595  if (ArgAlign < 4)
1596  ArgAlign = 4;
1597  SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1598  DAG.getConstant(paramCount, dl, MVT::i32),
1599  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1600  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1601  DeclareParamOps);
1602  InFlag = Chain.getValue(1);
1603  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1604  EVT elemtype = VTs[j];
1605  int curOffset = Offsets[j];
1606  unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1607  auto PtrVT = getPointerTy(DL);
1608  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1609  DAG.getConstant(curOffset, dl, PtrVT));
1610  SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1611  MachinePointerInfo(), PartAlign);
1612  if (elemtype.getSizeInBits() < 16) {
1613  theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1614  }
1615  SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1616  SDValue CopyParamOps[] = { Chain,
1617  DAG.getConstant(paramCount, dl, MVT::i32),
1618  DAG.getConstant(curOffset, dl, MVT::i32),
1619  theVal, InFlag };
1620  Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1621  CopyParamOps, elemtype,
1622  MachinePointerInfo(), /* Align */ 0,
1623                                       MachineMemOperand::MOStore);
1624 
1625  InFlag = Chain.getValue(1);
1626  }
1627  ++paramCount;
1628  }
1629 
1630   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1631   unsigned retAlignment = 0;
1632 
1633  // Handle Result
1634  if (Ins.size() > 0) {
1635  SmallVector<EVT, 16> resvtparts;
1636  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1637 
1638  // Declare
1639  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1640  // .param .b<size-in-bits> retval0
1641  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1642  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1643  // these three types to match the logic in
1644  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1645  // Plus, this behavior is consistent with nvcc's.
1646  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1647  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1648  // Scalar needs to be at least 32bit wide
1649  if (resultsz < 32)
1650  resultsz = 32;
1651  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1652  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1653  DAG.getConstant(resultsz, dl, MVT::i32),
1654  DAG.getConstant(0, dl, MVT::i32), InFlag };
1655  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1656  DeclareRetOps);
1657  InFlag = Chain.getValue(1);
1658  } else {
1659  retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1660  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1661  SDValue DeclareRetOps[] = { Chain,
1662  DAG.getConstant(retAlignment, dl, MVT::i32),
1663  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1664  DAG.getConstant(0, dl, MVT::i32), InFlag };
1665  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1666  DeclareRetOps);
1667  InFlag = Chain.getValue(1);
1668  }
1669  }
1670 
1671  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1672  // between them we must rely on the call site value which is valid for
1673  // indirect calls but is always null for libcalls.
1674  bool isIndirectCall = !Func && CS;
1675 
1676  if (isa<ExternalSymbolSDNode>(Callee)) {
1677  Function* CalleeFunc = nullptr;
1678 
1679  // Try to find the callee in the current module.
1680  Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1681  assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1682 
1683  // Set the "libcall callee" attribute to indicate that the function
1684  // must always have a declaration.
1685  CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1686  }
1687 
1688  if (isIndirectCall) {
1689  // This is the indirect function call case: PTX requires a prototype of the
1690  // form
1691  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1692  // to be emitted, and the label has to be used as the last arg of the call
1693  // instruction.
1694  // The prototype is embedded in a string and used as the operand of a
1695  // CallPrototype SDNode, which prints out as the value of the string.
1696  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1697  std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1698  const char *ProtoStr =
1699  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1700  SDValue ProtoOps[] = {
1701  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1702  };
1703  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1704  InFlag = Chain.getValue(1);
1705  }
1706  // Op to just print "call"
1707  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1708  SDValue PrintCallOps[] = {
1709  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1710  };
1711  // We model convergent calls as separate opcodes.
1712  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1713  if (CLI.IsConvergent)
1714  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1715  : NVPTXISD::PrintConvergentCall;
1716  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1717  InFlag = Chain.getValue(1);
1718 
1719  // Ops to print out the function name
1720  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1721  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1722  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1723  InFlag = Chain.getValue(1);
1724 
1725  // Ops to print out the param list
1726  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1727  SDValue CallArgBeginOps[] = { Chain, InFlag };
1728  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1729  CallArgBeginOps);
1730  InFlag = Chain.getValue(1);
1731 
1732  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1733  unsigned opcode;
1734  if (i == (e - 1))
1735  opcode = NVPTXISD::LastCallArg;
1736  else
1737  opcode = NVPTXISD::CallArg;
1738  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1739  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1740  DAG.getConstant(i, dl, MVT::i32), InFlag };
1741  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1742  InFlag = Chain.getValue(1);
1743  }
1744  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1745  SDValue CallArgEndOps[] = { Chain,
1746  DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1747  InFlag };
1748  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1749  InFlag = Chain.getValue(1);
1750 
1751  if (isIndirectCall) {
1752  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1753  SDValue PrototypeOps[] = { Chain,
1754  DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1755  InFlag };
1756  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1757  InFlag = Chain.getValue(1);
1758  }
1759 
1760  SmallVector<SDValue, 16> ProxyRegOps;
1761  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1762 
1763  // Generate loads from param memory/moves from registers for result
1764  if (Ins.size() > 0) {
1765  SmallVector<EVT, 16> VTs;
1766  SmallVector<uint64_t, 16> Offsets;
1767  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1768  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1769 
1770  unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1771  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1772 
1773  SmallVector<EVT, 6> LoadVTs;
1774  int VecIdx = -1; // Index of the first element of the vector.
1775 
1776  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1777  // 32-bits are sign extended or zero extended, depending on whether
1778  // they are signed or unsigned types.
1779  bool ExtendIntegerRetVal =
1780  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1781 
1782  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1783  bool needTruncate = false;
1784  EVT TheLoadType = VTs[i];
1785  EVT EltType = Ins[i].VT;
1786  unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1787  if (ExtendIntegerRetVal) {
1788  TheLoadType = MVT::i32;
1789  EltType = MVT::i32;
1790  needTruncate = true;
1791  } else if (TheLoadType.getSizeInBits() < 16) {
1792  if (VTs[i].isInteger())
1793  needTruncate = true;
1794  EltType = MVT::i16;
1795  }
1796 
1797  // Record index of the very first element of the vector.
1798  if (VectorInfo[i] & PVF_FIRST) {
1799  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1800  VecIdx = i;
1801  }
1802 
1803  LoadVTs.push_back(EltType);
1804 
1805  if (VectorInfo[i] & PVF_LAST) {
1806  unsigned NumElts = LoadVTs.size();
1807  LoadVTs.push_back(MVT::Other);
1808  LoadVTs.push_back(MVT::Glue);
1809  NVPTXISD::NodeType Op;
1810  switch (NumElts) {
1811  case 1:
1812  Op = NVPTXISD::LoadParam;
1813  break;
1814  case 2:
1815  Op = NVPTXISD::LoadParamV2;
1816  break;
1817  case 4:
1818  Op = NVPTXISD::LoadParamV4;
1819  break;
1820  default:
1821  llvm_unreachable("Invalid vector info.");
1822  }
1823 
1824  SDValue LoadOperands[] = {
1825  Chain, DAG.getConstant(1, dl, MVT::i32),
1826  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1827  SDValue RetVal = DAG.getMemIntrinsicNode(
1828  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1829  MachinePointerInfo(), EltAlign,
1830  MachineMemOperand::MOLoad);
1831 
1832  for (unsigned j = 0; j < NumElts; ++j) {
1833  ProxyRegOps.push_back(RetVal.getValue(j));
1834 
1835  if (needTruncate)
1836  ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1837  else
1838  ProxyRegTruncates.push_back(Optional<MVT>());
1839  }
1840 
1841  Chain = RetVal.getValue(NumElts);
1842  InFlag = RetVal.getValue(NumElts + 1);
1843 
1844  // Cleanup
1845  VecIdx = -1;
1846  LoadVTs.clear();
1847  }
1848  }
1849  }
1850 
1851  Chain = DAG.getCALLSEQ_END(Chain,
1852  DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1853  DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1854  true),
1855  InFlag, dl);
1856  InFlag = Chain.getValue(1);
1857  uniqueCallSite++;
1858 
1859  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1860  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1861  // dangling.
1862  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1863  SDValue Ret = DAG.getNode(
1864  NVPTXISD::ProxyReg, dl,
1865  DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1866  { Chain, ProxyRegOps[i], InFlag }
1867  );
1868 
1869  Chain = Ret.getValue(1);
1870  InFlag = Ret.getValue(2);
1871 
1872  if (ProxyRegTruncates[i].hasValue()) {
1873  Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1874  }
1875 
1876  InVals.push_back(Ret);
1877  }
1878 
1879  // set isTailCall to false for now, until we figure out how to express
1880  // tail call optimization in PTX
1881  isTailCall = false;
1882  return Chain;
1883 }
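// Roughly, for a direct call such as "int foo(int)" the node sequence built
// above prints as a PTX call site of the following shape (the name and the
// virtual registers are illustrative only):
//   .param .b32 param0;
//   st.param.b32  [param0], %r1;
//   .param .b32 retval0;
//   call.uni (retval0), foo, (param0);
//   ld.param.b32  %r2, [retval0];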
1884 
1885 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1886 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1887 // We use extract/insert/build-vector instead, just as LegalizeOp() did in LLVM 2.5.
1888 SDValue
1889 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1890  SDNode *Node = Op.getNode();
1891  SDLoc dl(Node);
1892  SmallVector<SDValue, 8> Ops;
1893  unsigned NumOperands = Node->getNumOperands();
1894  for (unsigned i = 0; i < NumOperands; ++i) {
1895  SDValue SubOp = Node->getOperand(i);
1896  EVT VVT = SubOp.getNode()->getValueType(0);
1897  EVT EltVT = VVT.getVectorElementType();
1898  unsigned NumSubElem = VVT.getVectorNumElements();
1899  for (unsigned j = 0; j < NumSubElem; ++j) {
1900  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1901  DAG.getIntPtrConstant(j, dl)));
1902  }
1903  }
1904  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1905 }
1906 
1907 // We can initialize a constant f16x2 with a single .b32 move. Normally it
1908 // would get lowered as two constant loads and a vector-packing move:
1909 // mov.b16 %h1, 0x4000;
1910 // mov.b16 %h2, 0x3C00;
1911 // mov.b32 %hh2, {%h2, %h1};
1912 // Instead we want just a constant move:
1913 // mov.b32 %hh2, 0x40003C00
1914 //
1915 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1916 // generates good SASS in both cases.
1917 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1918  SelectionDAG &DAG) const {
1919  //return Op;
1920  if (!(Op->getValueType(0) == MVT::v2f16 &&
1921  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1922  isa<ConstantFPSDNode>(Op->getOperand(1))))
1923  return Op;
1924 
1925  APInt E0 =
1926  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1927  APInt E1 =
1928  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1929  SDValue Const =
1930  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1931  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1932 }
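// A small sketch of the packing performed above, assuming IEEE fp16 bit
// patterns (0x3C00 == 1.0, 0x4000 == 2.0); operand 1 lands in the high half:
//   #include <cstdint>
//   uint32_t packF16x2(uint16_t E0, uint16_t E1) {
//     return (uint32_t)E1 << 16 | E0; // packF16x2(0x3C00, 0x4000) == 0x40003C00
//   }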
1933 
1934 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1935  SelectionDAG &DAG) const {
1936  SDValue Index = Op->getOperand(1);
1937  // Constant index will be matched by tablegen.
1938  if (isa<ConstantSDNode>(Index.getNode()))
1939  return Op;
1940 
1941  // Extract individual elements and select one of them.
1942  SDValue Vector = Op->getOperand(0);
1943  EVT VectorVT = Vector.getValueType();
1944  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1945  EVT EltVT = VectorVT.getVectorElementType();
1946 
1947  SDLoc dl(Op.getNode());
1948  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1949  DAG.getIntPtrConstant(0, dl));
1950  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1951  DAG.getIntPtrConstant(1, dl));
1952  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1953  ISD::SETEQ);
1954 }
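// For a non-constant index, the select-based lowering above is, as a C-like
// sketch (types are illustrative):
//   half extract(half2 V, int Index) { return Index == 0 ? V.x : V.y; }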
1955 
1956 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1957 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1958 /// amount, or
1959 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1960 /// amount.
1961 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1962  SelectionDAG &DAG) const {
1963  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1964  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1965 
1966  EVT VT = Op.getValueType();
1967  unsigned VTBits = VT.getSizeInBits();
1968  SDLoc dl(Op);
1969  SDValue ShOpLo = Op.getOperand(0);
1970  SDValue ShOpHi = Op.getOperand(1);
1971  SDValue ShAmt = Op.getOperand(2);
1972  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1973 
1974  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1975  // For 32-bit and sm_35 or later, we can use the funnel shift 'shf' instruction.
1976  // {dHi, dLo} = {aHi, aLo} >> Amt
1977  // dHi = aHi >> Amt
1978  // dLo = shf.r.clamp aLo, aHi, Amt
1979 
1980  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1981  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1982  ShAmt);
1983 
1984  SDValue Ops[2] = { Lo, Hi };
1985  return DAG.getMergeValues(Ops, dl);
1986  }
1987  else {
1988  // {dHi, dLo} = {aHi, aLo} >> Amt
1989  // - if (Amt>=size) then
1990  // dLo = aHi >> (Amt-size)
1991  // dHi = aHi >> Amt (this is either all 0 or all 1)
1992  // else
1993  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1994  // dHi = aHi >> Amt
1995 
1996  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1997  DAG.getConstant(VTBits, dl, MVT::i32),
1998  ShAmt);
1999  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2000  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2001  DAG.getConstant(VTBits, dl, MVT::i32));
2002  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2003  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2004  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2005 
2006  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2007  DAG.getConstant(VTBits, dl, MVT::i32),
2008  ISD::SETGE);
2009  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2010  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2011 
2012  SDValue Ops[2] = { Lo, Hi };
2013  return DAG.getMergeValues(Ops, dl);
2014  }
2015 }
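// Reference semantics for the expansion above, as a C-like sketch assuming
// 32-bit halves and 0 <= Amt < 64 (logical shift shown; the arithmetic case
// sign-extends aHi instead):
//   uint64_t Whole = (uint64_t)aHi << 32 | aLo;
//   uint64_t Res   = Whole >> Amt;
//   dLo = (uint32_t)Res;  dHi = (uint32_t)(Res >> 32);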
2016 
2017 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2018 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2019 /// amount, or
2020 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2021 /// amount.
2022 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2023  SelectionDAG &DAG) const {
2024  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2025  assert(Op.getOpcode() == ISD::SHL_PARTS);
2026 
2027  EVT VT = Op.getValueType();
2028  unsigned VTBits = VT.getSizeInBits();
2029  SDLoc dl(Op);
2030  SDValue ShOpLo = Op.getOperand(0);
2031  SDValue ShOpHi = Op.getOperand(1);
2032  SDValue ShAmt = Op.getOperand(2);
2033 
2034  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2035  // For 32-bit and sm_35 or later, we can use the funnel shift 'shf' instruction.
2036  // {dHi, dLo} = {aHi, aLo} << Amt
2037  // dHi = shf.l.clamp aLo, aHi, Amt
2038  // dLo = aLo << Amt
2039 
2040  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2041  ShAmt);
2042  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2043 
2044  SDValue Ops[2] = { Lo, Hi };
2045  return DAG.getMergeValues(Ops, dl);
2046  }
2047  else {
2048  // {dHi, dLo} = {aHi, aLo} << Amt
2049  // - if (Amt>=size) then
2050  // dLo = aLo << Amt (all 0)
2051  // dHi = aLo << (Amt-size)
2052  // else
2053  // dLo = aLo << Amt
2054  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2055 
2056  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2057  DAG.getConstant(VTBits, dl, MVT::i32),
2058  ShAmt);
2059  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2060  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2061  DAG.getConstant(VTBits, dl, MVT::i32));
2062  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2063  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2064  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2065 
2066  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2067  DAG.getConstant(VTBits, dl, MVT::i32),
2068  ISD::SETGE);
2069  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2070  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2071 
2072  SDValue Ops[2] = { Lo, Hi };
2073  return DAG.getMergeValues(Ops, dl);
2074  }
2075 }
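// On sm_35+, dHi above maps to the PTX left funnel shift with clamping; for
// Amt in [0, 32] it computes, as a C-like sketch:
//   // shf.l.clamp dHi, aLo, aHi, Amt  ==>
//   dHi = (uint32_t)((((uint64_t)aHi << 32 | aLo) << Amt) >> 32);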
2076 
2077 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2078  EVT VT = Op.getValueType();
2079 
2080  if (VT == MVT::f32)
2081  return LowerFROUND32(Op, DAG);
2082 
2083  if (VT == MVT::f64)
2084  return LowerFROUND64(Op, DAG);
2085 
2086  llvm_unreachable("unhandled type");
2087 }
2088 
2089 // This is the rounding method used in CUDA libdevice, in C-like code:
2090 // float roundf(float A)
2091 // {
2092 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2093 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2094 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2095 // }
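// For example, with A = 2.5f the lowering below computes 2.5f + 0.5f = 3.0f
// and truncates to 3.0f; since |A| is neither > 2^23 nor < 0.5, the result is
// 3.0f (round-half-away-from-zero, unlike rintf's round-to-nearest-even 2.0f).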
2096 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2097  SelectionDAG &DAG) const {
2098  SDLoc SL(Op);
2099  SDValue A = Op.getOperand(0);
2100  EVT VT = Op.getValueType();
2101 
2102  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2103 
2104  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2105  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2106  const int SignBitMask = 0x80000000;
2107  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2108  DAG.getConstant(SignBitMask, SL, MVT::i32));
2109  const int PointFiveInBits = 0x3F000000;
2110  SDValue PointFiveWithSignRaw =
2111  DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2112  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2113  SDValue PointFiveWithSign =
2114  DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2115  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2116  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2117 
2118  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2119  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2120  SDValue IsLarge =
2121  DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2122  ISD::SETOGT);
2123  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2124 
2125  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2126  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2127  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2128  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2129  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2130 }
2131 
2132 // The implementation of round(double) is similar to that of round(float) in
2133 // that they both separate the value range into three regions and use a method
2134 // specific to the region to round the values. However, round(double) first
2135 // calculates the round of the absolute value and then adds the sign back while
2136 // round(float) directly rounds the value with sign.
2137 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2138  SelectionDAG &DAG) const {
2139  SDLoc SL(Op);
2140  SDValue A = Op.getOperand(0);
2141  EVT VT = Op.getValueType();
2142 
2143  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2144 
2145  // double RoundedA = (double) (int) (abs(A) + 0.5f);
2146  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2147  DAG.getConstantFP(0.5, SL, VT));
2148  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2149 
2150  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2151  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2152  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2153  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2154  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2155  DAG.getConstantFP(0, SL, VT),
2156  RoundedA);
2157 
2158  // Add sign to rounded_A
2159  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2160  DAG.getNode(ISD::FTRUNC, SL, VT, A);
2161 
2162  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2163  SDValue IsLarge =
2164  DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2165  ISD::SETOGT);
2166  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2167 }
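// For example, with A = -2.5 the code above computes |A| + 0.5 = 3.0,
// truncates to 3.0, and copies the sign back to give -3.0; values with
// |A| < 0.5 collapse to +/-0.0, and values with |A| > 2^52 are returned as-is.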
2168 
2169 
2170 
2171 SDValue
2172 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2173  switch (Op.getOpcode()) {
2174  case ISD::RETURNADDR:
2175  return SDValue();
2176  case ISD::FRAMEADDR:
2177  return SDValue();
2178  case ISD::GlobalAddress:
2179  return LowerGlobalAddress(Op, DAG);
2180  case ISD::INTRINSIC_W_CHAIN:
2181  return Op;
2182  case ISD::BUILD_VECTOR:
2183  return LowerBUILD_VECTOR(Op, DAG);
2184  case ISD::EXTRACT_SUBVECTOR:
2185  return Op;
2186  case ISD::EXTRACT_VECTOR_ELT:
2187  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2188  case ISD::CONCAT_VECTORS:
2189  return LowerCONCAT_VECTORS(Op, DAG);
2190  case ISD::STORE:
2191  return LowerSTORE(Op, DAG);
2192  case ISD::LOAD:
2193  return LowerLOAD(Op, DAG);
2194  case ISD::SHL_PARTS:
2195  return LowerShiftLeftParts(Op, DAG);
2196  case ISD::SRA_PARTS:
2197  case ISD::SRL_PARTS:
2198  return LowerShiftRightParts(Op, DAG);
2199  case ISD::SELECT:
2200  return LowerSelect(Op, DAG);
2201  case ISD::FROUND:
2202  return LowerFROUND(Op, DAG);
2203  default:
2204  llvm_unreachable("Custom lowering not defined for operation");
2205  }
2206 }
2207 
2208 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2209  SDValue Op0 = Op->getOperand(0);
2210  SDValue Op1 = Op->getOperand(1);
2211  SDValue Op2 = Op->getOperand(2);
2212  SDLoc DL(Op.getNode());
2213 
2214  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2215 
2216  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2217  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2218  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2219  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2220 
2221  return Trunc;
2222 }
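// In C-like form the i1 select above is widened and narrowed again, roughly:
//   bool sel(bool C, bool A, bool B) {
//     int Wide = C ? (int)A : (int)B; // select performed on 32-bit registers
//     return (bool)(Wide & 1);        // truncate back to i1
//   }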
2223 
2224 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2225  if (Op.getValueType() == MVT::i1)
2226  return LowerLOADi1(Op, DAG);
2227 
2228  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2229  // loads and have to handle it here.
2230  if (Op.getValueType() == MVT::v2f16) {
2231  LoadSDNode *Load = cast<LoadSDNode>(Op);
2232  EVT MemVT = Load->getMemoryVT();
2233  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2234  *Load->getMemOperand())) {
2235  SDValue Ops[2];
2236  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2237  return DAG.getMergeValues(Ops, SDLoc(Op));
2238  }
2239  }
2240 
2241  return SDValue();
2242 }
2243 
2244 // v = ld i1* addr
2245 // =>
2246 // v1 = ld i8* addr (-> i16)
2247 // v = trunc i16 to i1
2248 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2249  SDNode *Node = Op.getNode();
2250  LoadSDNode *LD = cast<LoadSDNode>(Node);
2251  SDLoc dl(Node);
2253  assert(Node->getValueType(0) == MVT::i1 &&
2254  "Custom lowering for i1 load only");
2255  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2256  LD->getPointerInfo(), LD->getAlignment(),
2257  LD->getMemOperand()->getFlags());
2258  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2259  // The legalizer (the caller) is expecting two values from the legalized
2260  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2261  // in LegalizeDAG.cpp which also uses MergeValues.
2262  SDValue Ops[] = { result, LD->getChain() };
2263  return DAG.getMergeValues(Ops, dl);
2264 }
2265 
2266 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2267  StoreSDNode *Store = cast<StoreSDNode>(Op);
2268  EVT VT = Store->getMemoryVT();
2269 
2270  if (VT == MVT::i1)
2271  return LowerSTOREi1(Op, DAG);
2272 
2273  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2274  // stores and have to handle it here.
2275  if (VT == MVT::v2f16 &&
2276  !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2277  *Store->getMemOperand()))
2278  return expandUnalignedStore(Store, DAG);
2279 
2280  if (VT.isVector())
2281  return LowerSTOREVector(Op, DAG);
2282 
2283  return SDValue();
2284 }
2285 
2286 SDValue
2287 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2288  SDNode *N = Op.getNode();
2289  SDValue Val = N->getOperand(1);
2290  SDLoc DL(N);
2291  EVT ValVT = Val.getValueType();
2292 
2293  if (ValVT.isVector()) {
2294  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2295  // legal. We can (and should) split that into 2 stores of <2 x double> here
2296  // but I'm leaving that as a TODO for now.
2297  if (!ValVT.isSimple())
2298  return SDValue();
2299  switch (ValVT.getSimpleVT().SimpleTy) {
2300  default:
2301  return SDValue();
2302  case MVT::v2i8:
2303  case MVT::v2i16:
2304  case MVT::v2i32:
2305  case MVT::v2i64:
2306  case MVT::v2f16:
2307  case MVT::v2f32:
2308  case MVT::v2f64:
2309  case MVT::v4i8:
2310  case MVT::v4i16:
2311  case MVT::v4i32:
2312  case MVT::v4f16:
2313  case MVT::v4f32:
2314  case MVT::v8f16: // <4 x f16x2>
2315  // This is a "native" vector type
2316  break;
2317  }
2318 
2319  MemSDNode *MemSD = cast<MemSDNode>(N);
2320  const DataLayout &TD = DAG.getDataLayout();
2321 
2322  unsigned Align = MemSD->getAlignment();
2323  unsigned PrefAlign =
2324  TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2325  if (Align < PrefAlign) {
2326  // This store is not sufficiently aligned, so bail out and let this vector
2327  // store be scalarized. Note that we may still be able to emit smaller
2328  // vector stores. For example, if we are storing a <4 x float> with an
2329  // alignment of 8, this check will fail but the legalizer will try again
2330  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2331  return SDValue();
2332  }
2333 
2334  unsigned Opcode = 0;
2335  EVT EltVT = ValVT.getVectorElementType();
2336  unsigned NumElts = ValVT.getVectorNumElements();
2337 
2338  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2339  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2340  // stored type to i16 and propagate the "real" type as the memory type.
2341  bool NeedExt = false;
2342  if (EltVT.getSizeInBits() < 16)
2343  NeedExt = true;
2344 
2345  bool StoreF16x2 = false;
2346  switch (NumElts) {
2347  default:
2348  return SDValue();
2349  case 2:
2350  Opcode = NVPTXISD::StoreV2;
2351  break;
2352  case 4:
2353  Opcode = NVPTXISD::StoreV4;
2354  break;
2355  case 8:
2356  // v8f16 is a special case. PTX doesn't have an st.v8.f16
2357  // instruction. Instead, we split the vector into v2f16 chunks and
2358  // store them with st.v4.b32.
2359  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2360  Opcode = NVPTXISD::StoreV4;
2361  StoreF16x2 = true;
2362  break;
2363  }
2364 
2365  SmallVector<SDValue, 8> Ops;
2366 
2367  // First is the chain
2368  Ops.push_back(N->getOperand(0));
2369 
2370  if (StoreF16x2) {
2371  // Combine f16,f16 -> v2f16
2372  NumElts /= 2;
2373  for (unsigned i = 0; i < NumElts; ++i) {
2374  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2375  DAG.getIntPtrConstant(i * 2, DL));
2376  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2377  DAG.getIntPtrConstant(i * 2 + 1, DL));
2378  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2379  Ops.push_back(V2);
2380  }
2381  } else {
2382  // Then the split values
2383  for (unsigned i = 0; i < NumElts; ++i) {
2384  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2385  DAG.getIntPtrConstant(i, DL));
2386  if (NeedExt)
2387  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2388  Ops.push_back(ExtVal);
2389  }
2390  }
2391 
2392  // Then any remaining arguments
2393  Ops.append(N->op_begin() + 2, N->op_end());
2394 
2395  SDValue NewSt =
2396  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2397  MemSD->getMemoryVT(), MemSD->getMemOperand());
2398 
2399  // return DCI.CombineTo(N, NewSt, true);
2400  return NewSt;
2401  }
2402 
2403  return SDValue();
2404 }
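// Sketch of the v8f16 case handled above, in C-like form: the eight halves
// are regrouped into four 32-bit lanes before a single v4 store, roughly
//   uint32_t Lane[4];
//   for (int i = 0; i < 4; ++i)
//     Lane[i] = (uint32_t)Elt[2 * i + 1] << 16 | Elt[2 * i];
//   // Lane[0..3] are then written with one st.v4.b32.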
2405 
2406 // st i1 v, addr
2407 // =>
2408 // v1 = zxt v to i16
2409 // st.u8 i16, addr
2410 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2411  SDNode *Node = Op.getNode();
2412  SDLoc dl(Node);
2413  StoreSDNode *ST = cast<StoreSDNode>(Node);
2414  SDValue Tmp1 = ST->getChain();
2415  SDValue Tmp2 = ST->getBasePtr();
2416  SDValue Tmp3 = ST->getValue();
2417  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2418  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2419  SDValue Result =
2420  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2421  ST->getAlignment(), ST->getMemOperand()->getFlags());
2422  return Result;
2423 }
2424 
2425 SDValue
2426 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2427  std::string ParamSym;
2428  raw_string_ostream ParamStr(ParamSym);
2429 
2430  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2431  ParamStr.flush();
2432 
2433  std::string *SavedStr =
2434  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2435  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2436 }
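// For example, for a function named "foo" the first parameter is referenced
// through the external symbol "foo_param_0", which matches the name the
// AsmPrinter gives the corresponding .param declaration.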
2437 
2438 // Check to see if the kernel argument is image*_t or sampler_t
2439 
2440 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2441  static const char *const specialTypes[] = { "struct._image2d_t",
2442  "struct._image3d_t",
2443  "struct._sampler_t" };
2444 
2445  Type *Ty = arg->getType();
2446  auto *PTy = dyn_cast<PointerType>(Ty);
2447 
2448  if (!PTy)
2449  return false;
2450 
2451  if (!context)
2452  return false;
2453 
2454  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2455  if (!STy || STy->isLiteral())
2456  return false;
2457 
2458  return std::find(std::begin(specialTypes), std::end(specialTypes),
2459  STy->getName()) != std::end(specialTypes);
2460 }
2461 
2462 SDValue NVPTXTargetLowering::LowerFormalArguments(
2463  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2464  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2465  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2466  MachineFunction &MF = DAG.getMachineFunction();
2467  const DataLayout &DL = DAG.getDataLayout();
2468  auto PtrVT = getPointerTy(DAG.getDataLayout());
2469 
2470  const Function *F = &MF.getFunction();
2471  const AttributeList &PAL = F->getAttributes();
2472  const TargetLowering *TLI = STI.getTargetLowering();
2473 
2474  SDValue Root = DAG.getRoot();
2475  std::vector<SDValue> OutChains;
2476 
2477  bool isABI = (STI.getSmVersion() >= 20);
2478  assert(isABI && "Non-ABI compilation is not supported");
2479  if (!isABI)
2480  return Chain;
2481 
2482  std::vector<Type *> argTypes;
2483  std::vector<const Argument *> theArgs;
2484  for (const Argument &I : F->args()) {
2485  theArgs.push_back(&I);
2486  argTypes.push_back(I.getType());
2487  }
2488  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2489  // Ins.size() will be larger
2490  // * if there is an aggregate argument with multiple fields (each field
2491  // showing up separately in Ins)
2492  // * if there is a vector argument with more than typical vector-length
2493  // elements (generally if more than 4) where each vector element is
2494  // individually present in Ins.
2495  // So a different index should be used for indexing into Ins.
2496  // See similar issue in LowerCall.
2497  unsigned InsIdx = 0;
2498 
2499  int idx = 0;
2500  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2501  Type *Ty = argTypes[i];
2502 
2503  // If the kernel argument is image*_t or sampler_t, convert it to
2504  // an i32 constant holding the parameter position. This can later be
2505  // matched in the AsmPrinter to output the correct mangled name.
2506  if (isImageOrSamplerVal(
2507  theArgs[i],
2508  (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2509  : nullptr))) {
2510  assert(isKernelFunction(*F) &&
2511  "Only kernels can have image/sampler params");
2512  InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2513  continue;
2514  }
2515 
2516  if (theArgs[i]->use_empty()) {
2517  // argument is dead
2518  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2519  SmallVector<EVT, 16> vtparts;
2520 
2521  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2522  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2523  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2524  ++parti) {
2525  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2526  ++InsIdx;
2527  }
2528  if (vtparts.size() > 0)
2529  --InsIdx;
2530  continue;
2531  }
2532  if (Ty->isVectorTy()) {
2533  EVT ObjectVT = getValueType(DL, Ty);
2534  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2535  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2536  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2537  ++InsIdx;
2538  }
2539  if (NumRegs > 0)
2540  --InsIdx;
2541  continue;
2542  }
2543  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2544  continue;
2545  }
2546 
2547  // In the following cases, assign a node order of "idx+1"
2548  // to newly created nodes. The SDNodes for params have to
2549  // appear in the same order as their order of appearance
2550  // in the original function. "idx+1" holds that order.
2551  if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2552  bool aggregateIsPacked = false;
2553  if (StructType *STy = dyn_cast<StructType>(Ty))
2554  aggregateIsPacked = STy->isPacked();
2555 
2558  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2559  assert(VTs.size() > 0 && "Unexpected empty type.");
2560  auto VectorInfo =
2561  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2562 
2563  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2564  int VecIdx = -1; // Index of the first element of the current vector.
2565  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2566  if (VectorInfo[parti] & PVF_FIRST) {
2567  assert(VecIdx == -1 && "Orphaned vector.");
2568  VecIdx = parti;
2569  }
2570 
2571  // That's the last element of this store op.
2572  if (VectorInfo[parti] & PVF_LAST) {
2573  unsigned NumElts = parti - VecIdx + 1;
2574  EVT EltVT = VTs[parti];
2575  // i1 is loaded/stored as i8.
2576  EVT LoadVT = EltVT;
2577  if (EltVT == MVT::i1)
2578  LoadVT = MVT::i8;
2579  else if (EltVT == MVT::v2f16)
2580  // getLoad needs a vector type, but it can't handle
2581  // vectors which contain v2f16 elements. So we must load
2582  // using i32 here and then bitcast back.
2583  LoadVT = MVT::i32;
2584 
2585  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2586  SDValue VecAddr =
2587  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2588  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2589  Value *srcValue = Constant::getNullValue(PointerType::get(
2590  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2591  SDValue P =
2592  DAG.getLoad(VecVT, dl, Root, VecAddr,
2593  MachinePointerInfo(srcValue), aggregateIsPacked,
2596  if (P.getNode())
2597  P.getNode()->setIROrder(idx + 1);
2598  for (unsigned j = 0; j < NumElts; ++j) {
2599  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2600  DAG.getIntPtrConstant(j, dl));
2601  // We've loaded i1 as an i8 and now must truncate it back to i1
2602  if (EltVT == MVT::i1)
2603  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2604  // v2f16 was loaded as an i32. Now we must bitcast it back.
2605  else if (EltVT == MVT::v2f16)
2606  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2607  // Extend the element if necessary (e.g. an i8 is loaded
2608  // into an i16 register)
2609  if (Ins[InsIdx].VT.isInteger() &&
2610  Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2611  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2612  : ISD::ZERO_EXTEND;
2613  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2614  }
2615  InVals.push_back(Elt);
2616  }
2617 
2618  // Reset vector tracking state.
2619  VecIdx = -1;
2620  }
2621  ++InsIdx;
2622  }
2623  if (VTs.size() > 0)
2624  --InsIdx;
2625  continue;
2626  }
2627 
2628  // Param has ByVal attribute
2629  // Return MoveParam(param symbol).
2630  // Ideally, the param symbol could be returned directly,
2631  // but when the SDNode builder decides to use it in a CopyToReg(),
2632  // the machine instruction fails because TargetExternalSymbol
2633  // (not lowered) is target dependent, and CopyToReg assumes
2634  // the source is lowered.
2635  EVT ObjectVT = getValueType(DL, Ty);
2636  assert(ObjectVT == Ins[InsIdx].VT &&
2637  "Ins type did not match function type");
2638  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2639  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2640  if (p.getNode())
2641  p.getNode()->setIROrder(idx + 1);
2642  InVals.push_back(p);
2643  }
2644 
2645  // Clang will check explicit VarArg and issue an error if any. However, Clang
2646  // will let code with
2647  // implicit var arg like f() pass. See bug 617733.
2648  // We treat this case as if the arg list is empty.
2649  // if (F.isVarArg()) {
2650  // assert(0 && "VarArg not supported yet!");
2651  //}
2652 
2653  if (!OutChains.empty())
2654  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2655 
2656  return Chain;
2657 }
2658 
2659 SDValue
2660 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2661  bool isVarArg,
2662  const SmallVectorImpl<ISD::OutputArg> &Outs,
2663  const SmallVectorImpl<SDValue> &OutVals,
2664  const SDLoc &dl, SelectionDAG &DAG) const {
2665  MachineFunction &MF = DAG.getMachineFunction();
2666  Type *RetTy = MF.getFunction().getReturnType();
2667 
2668  bool isABI = (STI.getSmVersion() >= 20);
2669  assert(isABI && "Non-ABI compilation is not supported");
2670  if (!isABI)
2671  return Chain;
2672 
2673  const DataLayout DL = DAG.getDataLayout();
2674  SmallVector<EVT, 16> VTs;
2675  SmallVector<uint64_t, 16> Offsets;
2676  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2677  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2678 
2679  auto VectorInfo = VectorizePTXValueVTs(
2680  VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2681 
2682  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2683  // 32-bits are sign extended or zero extended, depending on whether
2684  // they are signed or unsigned types.
2685  bool ExtendIntegerRetVal =
2686  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2687 
2688  SmallVector<SDValue, 6> StoreOperands;
2689  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2690  // New load/store. Record chain and offset operands.
2691  if (VectorInfo[i] & PVF_FIRST) {
2692  assert(StoreOperands.empty() && "Orphaned operand list.");
2693  StoreOperands.push_back(Chain);
2694  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2695  }
2696 
2697  SDValue RetVal = OutVals[i];
2698  if (ExtendIntegerRetVal) {
2699  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2700  : ISD::ZERO_EXTEND,
2701  dl, MVT::i32, RetVal);
2702  } else if (RetVal.getValueSizeInBits() < 16) {
2703  // Use 16-bit registers for small load-stores as it's the
2704  // smallest general purpose register size supported by NVPTX.
2705  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2706  }
2707 
2708  // Record the value to return.
2709  StoreOperands.push_back(RetVal);
2710 
2711  // That's the last element of this store op.
2712  if (VectorInfo[i] & PVF_LAST) {
2713  NVPTXISD::NodeType Op;
2714  unsigned NumElts = StoreOperands.size() - 2;
2715  switch (NumElts) {
2716  case 1:
2717  Op = NVPTXISD::StoreRetval;
2718  break;
2719  case 2:
2720  Op = NVPTXISD::StoreRetvalV2;
2721  break;
2722  case 4:
2723  Op = NVPTXISD::StoreRetvalV4;
2724  break;
2725  default:
2726  llvm_unreachable("Invalid vector info.");
2727  }
2728 
2729  // Adjust type of load/store op if we've extended the scalar
2730  // return value.
2731  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2732  Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2733  StoreOperands, TheStoreType,
2734  MachinePointerInfo(), /* Align */ 1,
2735  MachineMemOperand::MOStore);
2736  // Cleanup vector state.
2737  StoreOperands.clear();
2738  }
2739  }
2740 
2741  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2742 }
2743 
2744 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2745  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2746  SelectionDAG &DAG) const {
2747  if (Constraint.length() > 1)
2748  return;
2749  else
2750  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2751 }
2752 
2753 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2754  switch (Intrinsic) {
2755  default:
2756  return 0;
2757 
2758  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2759  return NVPTXISD::Tex1DFloatS32;
2760  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2762  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2764  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2766  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2767  return NVPTXISD::Tex1DS32S32;
2768  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2769  return NVPTXISD::Tex1DS32Float;
2770  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2772  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2774  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2775  return NVPTXISD::Tex1DU32S32;
2776  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2777  return NVPTXISD::Tex1DU32Float;
2778  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2780  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2782 
2783  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2785  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2787  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2789  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2791  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2793  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2795  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2797  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2799  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2801  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2803  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2805  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2807 
2808  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2809  return NVPTXISD::Tex2DFloatS32;
2810  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2812  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2814  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2816  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2817  return NVPTXISD::Tex2DS32S32;
2818  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2819  return NVPTXISD::Tex2DS32Float;
2820  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2822  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2824  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2825  return NVPTXISD::Tex2DU32S32;
2826  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2827  return NVPTXISD::Tex2DU32Float;
2828  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2830  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2832 
2833  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2835  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2837  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2839  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2841  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2843  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2845  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2847  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2849  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2851  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2853  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2855  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2857 
2858  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2859  return NVPTXISD::Tex3DFloatS32;
2860  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2862  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2864  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2866  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2867  return NVPTXISD::Tex3DS32S32;
2868  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2869  return NVPTXISD::Tex3DS32Float;
2870  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2872  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2874  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2875  return NVPTXISD::Tex3DU32S32;
2876  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2877  return NVPTXISD::Tex3DU32Float;
2878  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2880  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2882 
2883  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2885  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2887  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2889  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2891  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2893  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2895 
2896  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2898  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2900  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2902  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2904  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2906  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2908 
2909  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2911  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2913  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2915  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2917  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2919  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2921  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2923  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2925  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2927  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2929  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2931  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2933 
2934  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2936  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2938  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2940  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2942  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2944  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2946  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2948  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2950  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2952  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2954  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2956  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2958 
2959  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2961  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2963  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2965  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2967  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2969  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2971  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2973  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2975  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2977  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2979  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2981  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2983 
2984  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2986  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2988  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2990  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2992  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2994  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2996  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2998  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3000  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3002  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3004  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3006  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3008 
3009  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3011  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3013  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3015  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3017  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3019  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3021  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3023  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3025  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3027  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3029  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3031  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3033 
3034  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3036  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3038  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3040  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3042  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3044  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3046  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3048  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3050  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3052  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3054  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3056  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3058 
3059  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3061  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3063  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3065  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3067  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3069  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3071 
3072  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3074  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3076  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3078  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3080  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3082  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3084 
3085  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3087  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3089  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3091  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3093  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3095  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3097  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3099  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3101  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3103  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3105  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3107  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3109  }
3110 }
3111 
3112 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3113  switch (Intrinsic) {
3114  default:
3115  return 0;
3116  case Intrinsic::nvvm_suld_1d_i8_clamp:
3117  return NVPTXISD::Suld1DI8Clamp;
3118  case Intrinsic::nvvm_suld_1d_i16_clamp:
3119  return NVPTXISD::Suld1DI16Clamp;
3120  case Intrinsic::nvvm_suld_1d_i32_clamp:
3121  return NVPTXISD::Suld1DI32Clamp;
3122  case Intrinsic::nvvm_suld_1d_i64_clamp:
3123  return NVPTXISD::Suld1DI64Clamp;
3124  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3126  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3128  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3130  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3132  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3134  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3136  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3138  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3140  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3142  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3144  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3146  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3148  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3150  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3152  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3154  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3156  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3158  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3160  case Intrinsic::nvvm_suld_2d_i8_clamp:
3161  return NVPTXISD::Suld2DI8Clamp;
3162  case Intrinsic::nvvm_suld_2d_i16_clamp:
3163  return NVPTXISD::Suld2DI16Clamp;
3164  case Intrinsic::nvvm_suld_2d_i32_clamp:
3165  return NVPTXISD::Suld2DI32Clamp;
3166  case Intrinsic::nvvm_suld_2d_i64_clamp:
3167  return NVPTXISD::Suld2DI64Clamp;
3168  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3170  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3172  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3174  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3176  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3178  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3180  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3182  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3184  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3186  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3188  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3190  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3192  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3194  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3196  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3198  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3200  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3202  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3204  case Intrinsic::nvvm_suld_3d_i8_clamp:
3205  return NVPTXISD::Suld3DI8Clamp;
3206  case Intrinsic::nvvm_suld_3d_i16_clamp:
3207  return NVPTXISD::Suld3DI16Clamp;
3208  case Intrinsic::nvvm_suld_3d_i32_clamp:
3209  return NVPTXISD::Suld3DI32Clamp;
3210  case Intrinsic::nvvm_suld_3d_i64_clamp:
3211  return NVPTXISD::Suld3DI64Clamp;
3212  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3214  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3216  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3218  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3220  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3222  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3224  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3226  case Intrinsic::nvvm_suld_1d_i8_trap:
3227  return NVPTXISD::Suld1DI8Trap;
3228  case Intrinsic::nvvm_suld_1d_i16_trap:
3229  return NVPTXISD::Suld1DI16Trap;
3230  case Intrinsic::nvvm_suld_1d_i32_trap:
3231  return NVPTXISD::Suld1DI32Trap;
3232  case Intrinsic::nvvm_suld_1d_i64_trap:
3233  return NVPTXISD::Suld1DI64Trap;
3234  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3235  return NVPTXISD::Suld1DV2I8Trap;
3236  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3238  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3240  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3242  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3243  return NVPTXISD::Suld1DV4I8Trap;
3244  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3246  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3248  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3250  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3252  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3254  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3256  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3258  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3260  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3262  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3264  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3266  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3268  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3270  case Intrinsic::nvvm_suld_2d_i8_trap:
3271  return NVPTXISD::Suld2DI8Trap;
3272  case Intrinsic::nvvm_suld_2d_i16_trap:
3273  return NVPTXISD::Suld2DI16Trap;
3274  case Intrinsic::nvvm_suld_2d_i32_trap:
3275  return NVPTXISD::Suld2DI32Trap;
3276  case Intrinsic::nvvm_suld_2d_i64_trap:
3277  return NVPTXISD::Suld2DI64Trap;
3278  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3279  return NVPTXISD::Suld2DV2I8Trap;
3280  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3282  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3284  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3286  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3287  return NVPTXISD::Suld2DV4I8Trap;
3288  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3290  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3292  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3294  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3296  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3298  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3300  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3302  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3304  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3306  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3308  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3310  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3312  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3314  case Intrinsic::nvvm_suld_3d_i8_trap:
3315  return NVPTXISD::Suld3DI8Trap;
3316  case Intrinsic::nvvm_suld_3d_i16_trap:
3317  return NVPTXISD::Suld3DI16Trap;
3318  case Intrinsic::nvvm_suld_3d_i32_trap:
3319  return NVPTXISD::Suld3DI32Trap;
3320  case Intrinsic::nvvm_suld_3d_i64_trap:
3321  return NVPTXISD::Suld3DI64Trap;
3322  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3323  return NVPTXISD::Suld3DV2I8Trap;
3324  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3326  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3328  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3330  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3331  return NVPTXISD::Suld3DV4I8Trap;
3332  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3334  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3336  case Intrinsic::nvvm_suld_1d_i8_zero:
3337  return NVPTXISD::Suld1DI8Zero;
3338  case Intrinsic::nvvm_suld_1d_i16_zero:
3339  return NVPTXISD::Suld1DI16Zero;
3340  case Intrinsic::nvvm_suld_1d_i32_zero:
3341  return NVPTXISD::Suld1DI32Zero;
3342  case Intrinsic::nvvm_suld_1d_i64_zero:
3343  return NVPTXISD::Suld1DI64Zero;
3344  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3345  return NVPTXISD::Suld1DV2I8Zero;
3346  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3347  return NVPTXISD::Suld1DV2I16Zero;
3348  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3349  return NVPTXISD::Suld1DV2I32Zero;
3350  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3351  return NVPTXISD::Suld1DV2I64Zero;
3352  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3353  return NVPTXISD::Suld1DV4I8Zero;
3354  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3355  return NVPTXISD::Suld1DV4I16Zero;
3356  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3357  return NVPTXISD::Suld1DV4I32Zero;
3358  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3359  return NVPTXISD::Suld1DArrayI8Zero;
3360  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3361  return NVPTXISD::Suld1DArrayI16Zero;
3362  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3363  return NVPTXISD::Suld1DArrayI32Zero;
3364  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3365  return NVPTXISD::Suld1DArrayI64Zero;
3366  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3367  return NVPTXISD::Suld1DArrayV2I8Zero;
3368  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3369  return NVPTXISD::Suld1DArrayV2I16Zero;
3370  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3371  return NVPTXISD::Suld1DArrayV2I32Zero;
3372  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3373  return NVPTXISD::Suld1DArrayV2I64Zero;
3374  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3375  return NVPTXISD::Suld1DArrayV4I8Zero;
3376  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3377  return NVPTXISD::Suld1DArrayV4I16Zero;
3378  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3379  return NVPTXISD::Suld1DArrayV4I32Zero;
3380  case Intrinsic::nvvm_suld_2d_i8_zero:
3381  return NVPTXISD::Suld2DI8Zero;
3382  case Intrinsic::nvvm_suld_2d_i16_zero:
3383  return NVPTXISD::Suld2DI16Zero;
3384  case Intrinsic::nvvm_suld_2d_i32_zero:
3385  return NVPTXISD::Suld2DI32Zero;
3386  case Intrinsic::nvvm_suld_2d_i64_zero:
3387  return NVPTXISD::Suld2DI64Zero;
3388  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3389  return NVPTXISD::Suld2DV2I8Zero;
3390  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3391  return NVPTXISD::Suld2DV2I16Zero;
3392  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3393  return NVPTXISD::Suld2DV2I32Zero;
3394  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3395  return NVPTXISD::Suld2DV2I64Zero;
3396  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3397  return NVPTXISD::Suld2DV4I8Zero;
3398  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3399  return NVPTXISD::Suld2DV4I16Zero;
3400  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3401  return NVPTXISD::Suld2DV4I32Zero;
3402  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3403  return NVPTXISD::Suld2DArrayI8Zero;
3404  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3405  return NVPTXISD::Suld2DArrayI16Zero;
3406  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3407  return NVPTXISD::Suld2DArrayI32Zero;
3408  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3409  return NVPTXISD::Suld2DArrayI64Zero;
3410  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3411  return NVPTXISD::Suld2DArrayV2I8Zero;
3412  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3413  return NVPTXISD::Suld2DArrayV2I16Zero;
3414  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3415  return NVPTXISD::Suld2DArrayV2I32Zero;
3416  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3417  return NVPTXISD::Suld2DArrayV2I64Zero;
3418  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3419  return NVPTXISD::Suld2DArrayV4I8Zero;
3420  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3421  return NVPTXISD::Suld2DArrayV4I16Zero;
3422  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3423  return NVPTXISD::Suld2DArrayV4I32Zero;
3424  case Intrinsic::nvvm_suld_3d_i8_zero:
3425  return NVPTXISD::Suld3DI8Zero;
3426  case Intrinsic::nvvm_suld_3d_i16_zero:
3427  return NVPTXISD::Suld3DI16Zero;
3428  case Intrinsic::nvvm_suld_3d_i32_zero:
3429  return NVPTXISD::Suld3DI32Zero;
3430  case Intrinsic::nvvm_suld_3d_i64_zero:
3431  return NVPTXISD::Suld3DI64Zero;
3432  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3433  return NVPTXISD::Suld3DV2I8Zero;
3434  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3435  return NVPTXISD::Suld3DV2I16Zero;
3436  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3437  return NVPTXISD::Suld3DV2I32Zero;
3438  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3439  return NVPTXISD::Suld3DV2I64Zero;
3440  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3441  return NVPTXISD::Suld3DV4I8Zero;
3442  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3443  return NVPTXISD::Suld3DV4I16Zero;
3444  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3445  return NVPTXISD::Suld3DV4I32Zero;
3446  }
3447 }
3448 
3449 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3450 // TgtMemIntrinsic because we need the information that is only available
3451 // in the "Value" type of the destination pointer; in particular, the
3452 // address space information.
3453 
3454 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3455  IntrinsicInfo &Info, const CallInst &I,
3456  MachineFunction &MF, unsigned Intrinsic) const {
3457  switch (Intrinsic) {
3458  default:
3459  return false;
3460  case Intrinsic::nvvm_match_all_sync_i32p:
3461  case Intrinsic::nvvm_match_all_sync_i64p:
3462  Info.opc = ISD::INTRINSIC_W_CHAIN;
3463  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3464  // in order to model data exchange with other threads, but perform no real
3465  // memory accesses.
3466  Info.memVT = MVT::i1;
3467 
3468  // Our result depends on both our own and the other threads' arguments.
3469  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3470  return true;
3471  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3472  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3473  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3474  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3475  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3476  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3477  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3478  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3479  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3480  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3481  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3482  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3483  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3484  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3485  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3486  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3487  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3488  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3489  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3490  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3491  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3492  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3493  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3494  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3495  Info.opc = ISD::INTRINSIC_W_CHAIN;
3496  Info.memVT = MVT::v8f16;
3497  Info.ptrVal = I.getArgOperand(0);
3498  Info.offset = 0;
3499  Info.flags = MachineMemOperand::MOLoad;
3500  Info.align = Align(16);
3501  return true;
3502  }
3503  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3504  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3505  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3506  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3507  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3508  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3509  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3510  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3511  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3512  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3513  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3514  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3515  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3516  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3517  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3518  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
3519  Info.opc = ISD::INTRINSIC_W_CHAIN;
3520  Info.memVT = MVT::v2i32;
3521  Info.ptrVal = I.getArgOperand(0);
3522  Info.offset = 0;
3523  Info.flags = MachineMemOperand::MOLoad;
3524  Info.align = Align(8);
3525  return true;
3526  }
3527 
3528  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3529  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3530  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3531  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3532  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3533  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3534  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3535  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3536 
3537  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3538  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3539  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3540  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3541  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3542  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3543  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3544  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
3545  Info.opc = ISD::INTRINSIC_W_CHAIN;
3546  Info.memVT = MVT::v4i32;
3547  Info.ptrVal = I.getArgOperand(0);
3548  Info.offset = 0;
3549  Info.flags = MachineMemOperand::MOLoad;
3550  Info.align = Align(16);
3551  return true;
3552  }
3553 
3554  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3555  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3556  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3557  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3558  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3559  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3560  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3561  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3562 
3563  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3564  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3565  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3566  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3567  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3568  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3569  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3570  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3571  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3572  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3573  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3574  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3575  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3576  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3577  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3578  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3579  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3580  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3581  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3582  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
3583  Info.opc = ISD::INTRINSIC_W_CHAIN;
3584  Info.memVT = MVT::i32;
3585  Info.ptrVal = I.getArgOperand(0);
3586  Info.offset = 0;
3587  Info.flags = MachineMemOperand::MOLoad;
3588  Info.align = Align(4);
3589  return true;
3590  }
3591 
3592  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3593  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3594  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3595  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3596  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3597  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3598  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3599  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3600  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3601  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3602  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3603  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3604  Info.opc = ISD::INTRINSIC_W_CHAIN;
3605  Info.memVT = MVT::v4f16;
3606  Info.ptrVal = I.getArgOperand(0);
3607  Info.offset = 0;
3608  Info.flags = MachineMemOperand::MOLoad;
3609  Info.align = Align(16);
3610  return true;
3611  }
3612 
3613  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3614  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3615  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3616  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3617  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3618  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3619  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3620  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3621  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3622  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3623  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3624  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3625  Info.opc = ISD::INTRINSIC_W_CHAIN;
3626  Info.memVT = MVT::v8f32;
3627  Info.ptrVal = I.getArgOperand(0);
3628  Info.offset = 0;
3629  Info.flags = MachineMemOperand::MOLoad;
3630  Info.align = Align(16);
3631  return true;
3632  }
3633 
3634  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3635  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3636  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3637  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3638  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3639  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3640  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3641  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3642  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3643  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3644  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3645  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3646  Info.opc = ISD::INTRINSIC_W_CHAIN;
3647  Info.memVT = MVT::v8i32;
3648  Info.ptrVal = I.getArgOperand(0);
3649  Info.offset = 0;
3650  Info.flags = MachineMemOperand::MOLoad;
3651  Info.align = Align(16);
3652  return true;
3653  }
3654 
3655  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3656  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3657  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3658  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3659  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3660  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3661  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3662  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
3663  Info.opc = ISD::INTRINSIC_W_CHAIN;
3664  Info.memVT = MVT::v2i32;
3665  Info.ptrVal = I.getArgOperand(0);
3666  Info.offset = 0;
3667  Info.flags = MachineMemOperand::MOLoad;
3668  Info.align = Align(8);
3669  return true;
3670  }
3671 
3672  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3673  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3674  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3675  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3676  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3677  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3678  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3679  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3680  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3681  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3682  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3683  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3684  Info.opc = ISD::INTRINSIC_VOID;
3685  Info.memVT = MVT::v4f16;
3686  Info.ptrVal = I.getArgOperand(0);
3687  Info.offset = 0;
3688  Info.flags = MachineMemOperand::MOStore;
3689  Info.align = Align(16);
3690  return true;
3691  }
3692 
3693  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3694  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3695  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3696  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3697  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3698  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3699  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3700  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3701  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3702  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3703  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3704  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3705  Info.opc = ISD::INTRINSIC_VOID;
3706  Info.memVT = MVT::v8f32;
3707  Info.ptrVal = I.getArgOperand(0);
3708  Info.offset = 0;
3709  Info.flags = MachineMemOperand::MOStore;
3710  Info.align = Align(16);
3711  return true;
3712  }
3713 
3714  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3715  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3716  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3717  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3718  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3719  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3720  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3721  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3722  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3723  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3724  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3725  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3726  Info.opc = ISD::INTRINSIC_VOID;
3727  Info.memVT = MVT::v8i32;
3728  Info.ptrVal = I.getArgOperand(0);
3729  Info.offset = 0;
3730  Info.flags = MachineMemOperand::MOStore;
3731  Info.align = Align(16);
3732  return true;
3733  }
3734 
3735  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3736  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3737  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3738  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3739  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3740  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3741  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3742  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3743  Info.opc = ISD::INTRINSIC_VOID;
3744  Info.memVT = MVT::v2i32;
3745  Info.ptrVal = I.getArgOperand(0);
3746  Info.offset = 0;
3747  Info.flags = MachineMemOperand::MOStore;
3748  Info.align = Align(8);
3749  return true;
3750  }
3751 
3752  case Intrinsic::nvvm_atomic_load_inc_32:
3753  case Intrinsic::nvvm_atomic_load_dec_32:
3754 
3755  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3756  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3757  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3758  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3759  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3760  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3761  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3762  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3763  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3764  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3765  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3766  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3767  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3768  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3769  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3770  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3771  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3772  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3773  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3774  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3775  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3776  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3777  auto &DL = I.getModule()->getDataLayout();
3778  Info.opc = ISD::INTRINSIC_W_CHAIN;
3779  Info.memVT = getValueType(DL, I.getType());
3780  Info.ptrVal = I.getArgOperand(0);
3781  Info.offset = 0;
3782  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3783  Info.align.reset();
3784  return true;
3785  }
3786 
3787  case Intrinsic::nvvm_ldu_global_i:
3788  case Intrinsic::nvvm_ldu_global_f:
3789  case Intrinsic::nvvm_ldu_global_p: {
3790  auto &DL = I.getModule()->getDataLayout();
3791  Info.opc = ISD::INTRINSIC_W_CHAIN;
3792  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3793  Info.memVT = getValueType(DL, I.getType());
3794  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3795  Info.memVT = getPointerTy(DL);
3796  else
3797  Info.memVT = getValueType(DL, I.getType());
3798  Info.ptrVal = I.getArgOperand(0);
3799  Info.offset = 0;
3800  Info.flags = MachineMemOperand::MOLoad;
3801  Info.align =
3802  MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
3803 
3804  return true;
3805  }
3806  case Intrinsic::nvvm_ldg_global_i:
3807  case Intrinsic::nvvm_ldg_global_f:
3808  case Intrinsic::nvvm_ldg_global_p: {
3809  auto &DL = I.getModule()->getDataLayout();
3810 
3811  Info.opc = ISD::INTRINSIC_W_CHAIN;
3812  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3813  Info.memVT = getValueType(DL, I.getType());
3814  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3815  Info.memVT = getPointerTy(DL);
3816  else
3817  Info.memVT = getValueType(DL, I.getType());
3818  Info.ptrVal = I.getArgOperand(0);
3819  Info.offset = 0;
3820  Info.flags = MachineMemOperand::MOLoad;
3821  Info.align =
3822  MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
3823 
3824  return true;
3825  }
3826 
3827  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3828  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3829  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3830  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3831  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3832  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3833  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3834  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3835  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3836  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3837  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3838  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3839  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3840  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3841  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3842  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3843  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3844  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3845  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3846  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3847  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3848  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3849  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3850  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3851  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3852  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3853  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3854  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3855  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3856  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3857  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3858  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3859  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3860  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3861  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3862  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3863  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3864  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3865  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3866  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3867  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3868  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3869  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3870  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3871  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3872  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3873  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3874  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3875  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3876  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3877  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3878  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3879  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3880  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3881  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3882  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3883  Info.opc = getOpcForTextureInstr(Intrinsic);
3884  Info.memVT = MVT::v4f32;
3885  Info.ptrVal = nullptr;
3886  Info.offset = 0;
3887  Info.flags = MachineMemOperand::MOLoad;
3888  Info.align = Align(16);
3889  return true;
3890 
3891  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3892  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3893  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3894  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3895  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3896  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3897  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3898  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3899  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3900  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3901  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3902  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3903  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3904  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3905  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3906  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3907  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3908  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3909  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3910  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3911  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3912  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3913  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3914  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3915  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3916  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3917  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3918  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3919  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3920  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3921  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3922  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3923  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3924  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3925  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3926  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3927  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3928  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3929  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3930  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3931  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3932  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3933  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3934  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3935  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3936  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3937  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3938  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3939  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3940  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3941  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3942  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3943  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3944  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3945  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3946  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3947  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3948  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3949  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3950  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3951  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3952  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3953  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3954  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3955  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3956  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3957  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3958  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3959  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3960  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3961  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3962  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3963  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3964  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3965  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3966  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3967  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3968  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3969  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3970  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3971  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3972  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3973  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3974  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3975  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3976  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3977  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3978  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3979  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3980  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3981  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3982  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3983  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3984  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3985  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3986  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3987  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3988  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3989  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3990  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3991  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3992  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3993  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3994  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3995  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3996  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3997  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3998  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3999  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4000  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4001  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4002  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4003  Info.opc = getOpcForTextureInstr(Intrinsic);
4004  Info.memVT = MVT::v4i32;
4005  Info.ptrVal = nullptr;
4006  Info.offset = 0;
4007  Info.flags = MachineMemOperand::MOLoad;
4008  Info.align = Align(16);
4009  return true;
4010 
4011  case Intrinsic::nvvm_suld_1d_i8_clamp:
4012  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4013  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4014  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4015  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4016  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4017  case Intrinsic::nvvm_suld_2d_i8_clamp:
4018  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4019  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4020  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4021  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4022  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4023  case Intrinsic::nvvm_suld_3d_i8_clamp:
4024  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4025  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4026  case Intrinsic::nvvm_suld_1d_i8_trap:
4027  case Intrinsic::nvvm_suld_1d_v2i8_trap:
4028  case Intrinsic::nvvm_suld_1d_v4i8_trap:
4029  case Intrinsic::nvvm_suld_1d_array_i8_trap:
4030  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4031  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4032  case Intrinsic::nvvm_suld_2d_i8_trap:
4033  case Intrinsic::nvvm_suld_2d_v2i8_trap:
4034  case Intrinsic::nvvm_suld_2d_v4i8_trap:
4035  case Intrinsic::nvvm_suld_2d_array_i8_trap:
4036  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4037  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4038  case Intrinsic::nvvm_suld_3d_i8_trap:
4039  case Intrinsic::nvvm_suld_3d_v2i8_trap:
4040  case Intrinsic::nvvm_suld_3d_v4i8_trap:
4041  case Intrinsic::nvvm_suld_1d_i8_zero:
4042  case Intrinsic::nvvm_suld_1d_v2i8_zero:
4043  case Intrinsic::nvvm_suld_1d_v4i8_zero:
4044  case Intrinsic::nvvm_suld_1d_array_i8_zero:
4045  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4046  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4047  case Intrinsic::nvvm_suld_2d_i8_zero:
4048  case Intrinsic::nvvm_suld_2d_v2i8_zero:
4049  case Intrinsic::nvvm_suld_2d_v4i8_zero:
4050  case Intrinsic::nvvm_suld_2d_array_i8_zero:
4051  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4052  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4053  case Intrinsic::nvvm_suld_3d_i8_zero:
4054  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4055  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4056  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4057  Info.memVT = MVT::i8;
4058  Info.ptrVal = nullptr;
4059  Info.offset = 0;
4060  Info.flags = MachineMemOperand::MOLoad;
4061  Info.align = Align(16);
4062  return true;
4063 
4064  case Intrinsic::nvvm_suld_1d_i16_clamp:
4065  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4066  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4067  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4068  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4069  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4070  case Intrinsic::nvvm_suld_2d_i16_clamp:
4071  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4072  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4073  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4074  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4075  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4076  case Intrinsic::nvvm_suld_3d_i16_clamp:
4077  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4078  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4079  case Intrinsic::nvvm_suld_1d_i16_trap:
4080  case Intrinsic::nvvm_suld_1d_v2i16_trap:
4081  case Intrinsic::nvvm_suld_1d_v4i16_trap:
4082  case Intrinsic::nvvm_suld_1d_array_i16_trap:
4083  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4084  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4085  case Intrinsic::nvvm_suld_2d_i16_trap:
4086  case Intrinsic::nvvm_suld_2d_v2i16_trap:
4087  case Intrinsic::nvvm_suld_2d_v4i16_trap:
4088  case Intrinsic::nvvm_suld_2d_array_i16_trap:
4089  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4090  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4091  case Intrinsic::nvvm_suld_3d_i16_trap:
4092  case Intrinsic::nvvm_suld_3d_v2i16_trap:
4093  case Intrinsic::nvvm_suld_3d_v4i16_trap:
4094  case Intrinsic::nvvm_suld_1d_i16_zero:
4095  case Intrinsic::nvvm_suld_1d_v2i16_zero:
4096  case Intrinsic::nvvm_suld_1d_v4i16_zero:
4097  case Intrinsic::nvvm_suld_1d_array_i16_zero:
4098  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4099  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4100  case Intrinsic::nvvm_suld_2d_i16_zero:
4101  case Intrinsic::nvvm_suld_2d_v2i16_zero:
4102  case Intrinsic::nvvm_suld_2d_v4i16_zero:
4103  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4104  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4105  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4106  case Intrinsic::nvvm_suld_3d_i16_zero:
4107  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4108  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4109  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4110  Info.memVT = MVT::i16;
4111  Info.ptrVal = nullptr;
4112  Info.offset = 0;
4113  Info.flags = MachineMemOperand::MOLoad;
4114  Info.align = Align(16);
4115  return true;
4116 
4117  case Intrinsic::nvvm_suld_1d_i32_clamp:
4118  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4119  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4120  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4121  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4122  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4123  case Intrinsic::nvvm_suld_2d_i32_clamp:
4124  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4125  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4126  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4127  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4128  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4129  case Intrinsic::nvvm_suld_3d_i32_clamp:
4130  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4131  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4132  case Intrinsic::nvvm_suld_1d_i32_trap:
4133  case Intrinsic::nvvm_suld_1d_v2i32_trap:
4134  case Intrinsic::nvvm_suld_1d_v4i32_trap:
4135  case Intrinsic::nvvm_suld_1d_array_i32_trap:
4136  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4137  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4138  case Intrinsic::nvvm_suld_2d_i32_trap:
4139  case Intrinsic::nvvm_suld_2d_v2i32_trap:
4140  case Intrinsic::nvvm_suld_2d_v4i32_trap:
4141  case Intrinsic::nvvm_suld_2d_array_i32_trap:
4142  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4143  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4144  case Intrinsic::nvvm_suld_3d_i32_trap:
4145  case Intrinsic::nvvm_suld_3d_v2i32_trap:
4146  case Intrinsic::nvvm_suld_3d_v4i32_trap:
4147  case Intrinsic::nvvm_suld_1d_i32_zero:
4148  case Intrinsic::nvvm_suld_1d_v2i32_zero:
4149  case Intrinsic::nvvm_suld_1d_v4i32_zero:
4150  case Intrinsic::nvvm_suld_1d_array_i32_zero:
4151  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4152  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4153  case Intrinsic::nvvm_suld_2d_i32_zero:
4154  case Intrinsic::nvvm_suld_2d_v2i32_zero:
4155  case Intrinsic::nvvm_suld_2d_v4i32_zero:
4156  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4157  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4158  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4159  case Intrinsic::nvvm_suld_3d_i32_zero:
4160  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4161  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4162  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4163  Info.memVT = MVT::i32;
4164  Info.ptrVal = nullptr;
4165  Info.offset = 0;
4166  Info.flags = MachineMemOperand::MOLoad;
4167  Info.align = Align(16);
4168  return true;
4169 
4170  case Intrinsic::nvvm_suld_1d_i64_clamp:
4171  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4172  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4173  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4174  case Intrinsic::nvvm_suld_2d_i64_clamp:
4175  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4176  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4177  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4178  case Intrinsic::nvvm_suld_3d_i64_clamp:
4179  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4180  case Intrinsic::nvvm_suld_1d_i64_trap:
4181  case Intrinsic::nvvm_suld_1d_v2i64_trap:
4182  case Intrinsic::nvvm_suld_1d_array_i64_trap:
4183  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4184  case Intrinsic::nvvm_suld_2d_i64_trap:
4185  case Intrinsic::nvvm_suld_2d_v2i64_trap:
4186  case Intrinsic::nvvm_suld_2d_array_i64_trap:
4187  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4188  case Intrinsic::nvvm_suld_3d_i64_trap:
4189  case Intrinsic::nvvm_suld_3d_v2i64_trap:
4190  case Intrinsic::nvvm_suld_1d_i64_zero:
4191  case Intrinsic::nvvm_suld_1d_v2i64_zero:
4192  case Intrinsic::nvvm_suld_1d_array_i64_zero:
4193  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4194  case Intrinsic::nvvm_suld_2d_i64_zero:
4195  case Intrinsic::nvvm_suld_2d_v2i64_zero:
4196  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4197  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4198  case Intrinsic::nvvm_suld_3d_i64_zero:
4199  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4200  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4201  Info.memVT = MVT::i64;
4202  Info.ptrVal = nullptr;
4203  Info.offset = 0;
4204  Info.flags = MachineMemOperand::MOLoad;
4205  Info.align = Align(16);
4206  return true;
4207  }
4208  return false;
4209 }
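// Illustrative example (not part of the upstream file): for a call such as
//   %v = call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %p, i32 4)
// the nvvm_ldg_global_i case above reports an INTRINSIC_W_CHAIN node that
// loads an i32 through %p with the alignment taken from the second argument,
// so the DAG can treat the intrinsic like an ordinary load for scheduling and
// alias analysis. The mangled intrinsic name is shown only for illustration.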
4210 
4211 /// isLegalAddressingMode - Return true if the addressing mode represented
4212 /// by AM is legal for this target, for a load/store of the specified type.
4213 /// Used to guide target specific optimizations, like loop strength reduction
4214 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
4215 /// (CodeGenPrepare.cpp)
4216 bool NVPTXTargetLowering::isLegalAddressingMode(
4217  const AddrMode &AM, Type *Ty,
4218  unsigned AS, Instruction *I) const {
4219  // AddrMode - This represents an addressing mode of:
4220  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4221  //
4222  // The legal address modes are
4223  // - [avar]
4224  // - [areg]
4225  // - [areg+immoff]
4226  // - [immAddr]
4227 
4228  if (AM.BaseGV) {
4229  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4230  }
4231 
4232  switch (AM.Scale) {
4233  case 0: // "r", "r+i" or "i" is allowed
4234  break;
4235  case 1:
4236  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4237  return false;
4238  // Otherwise we have r+i.
4239  break;
4240  default:
4241  // No scale > 1 is allowed
4242  return false;
4243  }
4244  return true;
4245 }
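// Illustrative examples (not in the upstream file): an access of the form
// [%r+8] (base register plus immediate offset, Scale == 0) is accepted above,
// as is a plain [globalvar] address (BaseGV with no offset or register). An
// access such as [%r1+%r2] (HasBaseReg together with Scale == 1) is rejected,
// because PTX has no reg+reg addressing mode.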
4246 
4247 //===----------------------------------------------------------------------===//
4248 // NVPTX Inline Assembly Support
4249 //===----------------------------------------------------------------------===//
4250 
4251 /// getConstraintType - Given a constraint letter, return the type of
4252 /// constraint it is for this target.
4253 NVPTXTargetLowering::ConstraintType
4254 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4255  if (Constraint.size() == 1) {
4256  switch (Constraint[0]) {
4257  default:
4258  break;
4259  case 'b':
4260  case 'r':
4261  case 'h':
4262  case 'c':
4263  case 'l':
4264  case 'f':
4265  case 'd':
4266  case '0':
4267  case 'N':
4268  return C_RegisterClass;
4269  }
4270  }
4271  return TargetLowering::getConstraintType(Constraint);
4272 }
4273 
4274 std::pair<unsigned, const TargetRegisterClass *>
4275 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4276  StringRef Constraint,
4277  MVT VT) const {
4278  if (Constraint.size() == 1) {
4279  switch (Constraint[0]) {
4280  case 'b':
4281  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4282  case 'c':
4283  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4284  case 'h':
4285  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4286  case 'r':
4287  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4288  case 'l':
4289  case 'N':
4290  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4291  case 'f':
4292  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4293  case 'd':
4294  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4295  }
4296  }
4297  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4298 }
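// Illustrative use (not in the upstream file): an inline-asm statement such as
//   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
// resolves each "r" constraint to Int32RegsRegClass via the switch above,
// while "l" would select Int64RegsRegClass, "f" Float32RegsRegClass and
// "d" Float64RegsRegClass.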
4299 
4300 //===----------------------------------------------------------------------===//
4301 // NVPTX DAG Combining
4302 //===----------------------------------------------------------------------===//
4303 
4304 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4305  CodeGenOpt::Level OptLevel) const {
4306  // Always honor command-line argument
4307  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4308  return FMAContractLevelOpt > 0;
4309 
4310  // Do not contract if we're not optimizing the code.
4311  if (OptLevel == 0)
4312  return false;
4313 
4314  // Honor TargetOptions flags that explicitly say fusion is okay.
4315  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4316  return true;
4317 
4318  return allowUnsafeFPMath(MF);
4319 }
4320 
4321 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4322  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4323  if (MF.getTarget().Options.UnsafeFPMath)
4324  return true;
4325 
4326  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4327  const Function &F = MF.getFunction();
4328  if (F.hasFnAttribute("unsafe-fp-math")) {
4329  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4330  StringRef Val = Attr.getValueAsString();
4331  if (Val == "true")
4332  return true;
4333  }
4334 
4335  return false;
4336 }
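// Illustrative note (not in the upstream file): a translation unit built with
// -ffast-math (which sets TargetOptions::UnsafeFPMath), or a function carrying
// the "unsafe-fp-math"="true" attribute, therefore enables the FMA contraction
// tested in allowFMA() above even when the nvptx-fmad option is not given on
// the command line.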
4337 
4338 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4339 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4340 /// called with the default operands, and if that fails, with commuted
4341 /// operands.
4342 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4343  TargetLowering::DAGCombinerInfo &DCI,
4344  const NVPTXSubtarget &Subtarget,
4345  CodeGenOpt::Level OptLevel) {
4346  SelectionDAG &DAG = DCI.DAG;
4347  // Skip non-integer, non-scalar case
4348  EVT VT=N0.getValueType();
4349  if (VT.isVector())
4350  return SDValue();
4351 
4352  // fold (add (mul a, b), c) -> (mad a, b, c)
4353  //
4354  if (N0.getOpcode() == ISD::MUL) {
4355  assert (VT.isInteger());
4356  // For integer:
4357  // Since integer multiply-add costs the same as integer multiply
4358  // but is more costly than integer add, do the fusion only when
4359  // the mul is only used in the add.
4360  if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4361  !N0.getNode()->hasOneUse())
4362  return SDValue();
4363 
4364  // Do the folding
4365  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4366  N0.getOperand(0), N0.getOperand(1), N1);
4367  }
4368  else if (N0.getOpcode() == ISD::FMUL) {
4369  if (VT == MVT::f32 || VT == MVT::f64) {
4370  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4371  &DAG.getTargetLoweringInfo());
4372  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4373  return SDValue();
4374 
4375  // For floating point:
4376  // Do the fusion only when the mul has less than 5 uses and all
4377  // are add.
4378  // The heuristic is that if a use is not an add, then that use
4379  // cannot be fused into fma, therefore mul is still needed anyway.
4380  // If there are more than 4 uses, even if they are all add, fusing
4381  // them will increase register pressure.
4382  //
4383  int numUses = 0;
4384  int nonAddCount = 0;
4385  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4386  UE = N0.getNode()->use_end();
4387  UI != UE; ++UI) {
4388  numUses++;
4389  SDNode *User = *UI;
4390  if (User->getOpcode() != ISD::FADD)
4391  ++nonAddCount;
4392  }
4393  if (numUses >= 5)
4394  return SDValue();
4395  if (nonAddCount) {
4396  int orderNo = N->getIROrder();
4397  int orderNo2 = N0.getNode()->getIROrder();
4398  // Simple heuristic for estimating potential register pressure: the
4399  // difference is used to measure the distance between the def and the use;
4400  // the longer the distance, the more likely it is to cause register
4401  // pressure.
4402  if (orderNo - orderNo2 < 500)
4403  return SDValue();
4404 
4405  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4406  // which guarantees that the FMA will not increase register pressure at node N.
4407  bool opIsLive = false;
4408  const SDNode *left = N0.getOperand(0).getNode();
4409  const SDNode *right = N0.getOperand(1).getNode();
4410 
4411  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4412  opIsLive = true;
4413 
4414  if (!opIsLive)
4415  for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4416  SDNode *User = *UI;
4417  int orderNo3 = User->getIROrder();
4418  if (orderNo3 > orderNo) {
4419  opIsLive = true;
4420  break;
4421  }
4422  }
4423 
4424  if (!opIsLive)
4425  for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4426  SDNode *User = *UI;
4427  int orderNo3 = User->getIROrder();
4428  if (orderNo3 > orderNo) {
4429  opIsLive = true;
4430  break;
4431  }
4432  }
4433 
4434  if (!opIsLive)
4435  return SDValue();
4436  }
4437 
4438  return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4439  N0.getOperand(0), N0.getOperand(1), N1);
4440  }
4441  }
4442 
4443  return SDValue();
4444 }
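// Illustrative folds performed above (not in the upstream file):
//   (add (mul i32 %a, %b), %c)    --> (NVPTXISD::IMAD %a, %b, %c)
//     when the multiply is i32 and has a single use;
//   (fadd (fmul f32 %a, %b), %c)  --> (fma f32 %a, %b, %c)
//     when allowFMA() holds and the register-pressure heuristic is satisfied.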
4445 
4446 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4447 ///
4448 static SDValue PerformADDCombine(SDNode *N,
4449  TargetLowering::DAGCombinerInfo &DCI,
4450  const NVPTXSubtarget &Subtarget,
4451  CodeGenOpt::Level OptLevel) {
4452  SDValue N0 = N->getOperand(0);
4453  SDValue N1 = N->getOperand(1);
4454 
4455  // First try with the default operand order.
4456  if (SDValue Result =
4457  PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4458  return Result;
4459 
4460  // If that didn't work, try again with the operands commuted.
4461  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4462 }
4463 
4464 static SDValue PerformANDCombine(SDNode *N,
4465  TargetLowering::DAGCombinerInfo &DCI) {
4466  // The type legalizer turns a vector load of i8 values into a zextload to i16
4467  // registers, optionally ANY_EXTENDs it (if target type is integer),
4468  // and ANDs off the high 8 bits. Since we turn this load into a
4469  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4470  // nodes. Do that here.
4471  SDValue Val = N->getOperand(0);
4472  SDValue Mask = N->getOperand(1);
4473 
4474  if (isa<ConstantSDNode>(Val)) {
4475  std::swap(Val, Mask);
4476  }
4477 
4478  SDValue AExt;
4479  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4480  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4481  AExt = Val;
4482  Val = Val->getOperand(0);
4483  }
4484 
4485  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4486  Val = Val->getOperand(0);
4487  }
4488 
4489  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4490  Val->getOpcode() == NVPTXISD::LoadV4) {
4491  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4492  if (!MaskCnst) {
4493  // Not an AND with a constant
4494  return SDValue();
4495  }
4496 
4497  uint64_t MaskVal = MaskCnst->getZExtValue();
4498  if (MaskVal != 0xff) {
4499  // Not an AND that chops off top 8 bits
4500  return SDValue();
4501  }
4502 
4503  MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4504  if (!Mem) {
4505  // Not a MemSDNode?!?
4506  return SDValue();
4507  }
4508 
4509  EVT MemVT = Mem->getMemoryVT();
4510  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4511  // We only handle the i8 case
4512  return SDValue();
4513  }
4514 
4515  unsigned ExtType =
4516  cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4517  getZExtValue();
4518  if (ExtType == ISD::SEXTLOAD) {
4519  // If for some reason the load is a sextload, the and is needed to zero
4520  // out the high 8 bits
4521  return SDValue();
4522  }
4523 
4524  bool AddTo = false;
4525  if (AExt.getNode() != nullptr) {
4526  // Re-insert the ext as a zext.
4527  Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4528  AExt.getValueType(), Val);
4529  AddTo = true;
4530  }
4531 
4532  // If we get here, the AND is unnecessary. Just replace it with the load
4533  DCI.CombineTo(N, Val, AddTo);
4534  }
4535 
4536  return SDValue();
4537 }
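// Illustrative pattern handled above (not in the upstream file): a chain such
// as
//   (and (any_extend (IMOV16rr (NVPTXISD::LoadV4 ... v4i8, zextload))), 0xff)
// is collapsed to the load itself (re-extended with zero_extend), since a
// non-sign-extending i8 load already leaves the upper bits clear and the AND
// is redundant.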
4538 
4539 static SDValue PerformREMCombine(SDNode *N,
4540  TargetLowering::DAGCombinerInfo &DCI,
4541  CodeGenOpt::Level OptLevel) {
4542  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4543 
4544  // Don't do anything at less than -O2.
4545  if (OptLevel < CodeGenOpt::Default)
4546  return SDValue();
4547 
4548  SelectionDAG &DAG = DCI.DAG;
4549  SDLoc DL(N);
4550  EVT VT = N->getValueType(0);
4551  bool IsSigned = N->getOpcode() == ISD::SREM;
4552  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4553 
4554  const SDValue &Num = N->getOperand(0);
4555  const SDValue &Den = N->getOperand(1);
4556 
4557  for (const SDNode *U : Num->uses()) {
4558  if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4559  U->getOperand(1) == Den) {
4560  // Num % Den -> Num - (Num / Den) * Den
4561  return DAG.getNode(ISD::SUB, DL, VT, Num,
4562  DAG.getNode(ISD::MUL, DL, VT,
4563  DAG.getNode(DivOpc, DL, VT, Num, Den),
4564  Den));
4565  }
4566  }
4567  return SDValue();
4568 }
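// Illustrative rewrite performed above (not in the upstream file): given
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d
// the urem is replaced by %n - (%n / %d) * %d, so only one machine-level
// division remains and it is shared between the quotient and the remainder.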
4569 
4570 enum OperandSignedness {
4571  Signed = 0,
4572  Unsigned,
4573  Unknown
4574 };
4575 
4576 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4577 /// that can be demoted to \p OptSize bits without loss of information. The
4578 /// signedness of the operand, if determinable, is placed in \p S.
4579 static bool IsMulWideOperandDemotable(SDValue Op,
4580  unsigned OptSize,
4581  OperandSignedness &S) {
4582  S = Unknown;
4583 
4584  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4585  Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4586  EVT OrigVT = Op.getOperand(0).getValueType();
4587  if (OrigVT.getSizeInBits() <= OptSize) {
4588  S = Signed;
4589  return true;
4590  }
4591  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4592  EVT OrigVT = Op.getOperand(0).getValueType();
4593  if (OrigVT.getSizeInBits() <= OptSize) {
4594  S = Unsigned;
4595  return true;
4596  }
4597  }
4598 
4599  return false;
4600 }
4601 
4602 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4603 /// be demoted to \p OptSize bits without loss of information. If the operands
4604 /// contain a constant, it should appear as the RHS operand. The signedness of
4605 /// the operands is placed in \p IsSigned.
4606 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4607  unsigned OptSize,
4608  bool &IsSigned) {
4609  OperandSignedness LHSSign;
4610 
4611  // The LHS operand must be a demotable op
4612  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4613  return false;
4614 
4615  // We should have been able to determine the signedness from the LHS
4616  if (LHSSign == Unknown)
4617  return false;
4618 
4619  IsSigned = (LHSSign == Signed);
4620 
4621  // The RHS can be a demotable op or a constant
4622  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4623  const APInt &Val = CI->getAPIntValue();
4624  if (LHSSign == Unsigned) {
4625  return Val.isIntN(OptSize);
4626  } else {
4627  return Val.isSignedIntN(OptSize);
4628  }
4629  } else {
4630  OperandSignedness RHSSign;
4631  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4632  return false;
4633 
4634  return LHSSign == RHSSign;
4635  }
4636 }
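// Illustrative example (not in the upstream file): for
//   (mul i64 (sext i32 %a), (sext i32 %b))
// both operands are demotable to 32 bits with matching Signed signedness, so
// the multiply can later be rewritten as a signed mul.wide that produces the
// full 64-bit product directly from the 32-bit sources.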
4637 
4638 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4639 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4640 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4641 /// amount.
4642 static