1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
33 #include "llvm/IR/Argument.h"
34 #include "llvm/IR/Attributes.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/FPEnv.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/IR/GlobalValue.h"
41 #include "llvm/IR/Instruction.h"
42 #include "llvm/IR/Instructions.h"
43 #include "llvm/IR/IntrinsicsNVPTX.h"
44 #include "llvm/IR/Module.h"
45 #include "llvm/IR/Type.h"
46 #include "llvm/IR/Value.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CodeGen.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <iterator>
59 #include <sstream>
60 #include <string>
61 #include <utility>
62 #include <vector>
63 
64 #define DEBUG_TYPE "nvptx-lower"
65 
66 using namespace llvm;
67 
68 static std::atomic<unsigned> GlobalUniqueCallSite;
69 
71  "nvptx-sched4reg",
72  cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
73 
74 static cl::opt<unsigned>
75  FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
76  cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
77  " 1: do it 2: do it aggressively"),
78  cl::init(2));
79 
81  "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
82  cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
83  " IEEE Compliant F32 div.rnd if available."),
84  cl::init(2));
85 
87  "nvptx-prec-sqrtf32", cl::Hidden,
88  cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
89  cl::init(true));
90 
91 int NVPTXTargetLowering::getDivF32Level() const {
92  if (UsePrecDivF32.getNumOccurrences() > 0) {
93  // If nvptx-prec-divf32=N is used on the command-line, always honor it
94  return UsePrecDivF32;
95  } else {
96  // Otherwise, use div.approx if fast math is enabled
97  if (getTargetMachine().Options.UnsafeFPMath)
98  return 0;
99  else
100  return 2;
101  }
102 }
103 
104 bool NVPTXTargetLowering::usePrecSqrtF32() const {
105  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
106  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
107  return UsePrecSqrtF32;
108  } else {
109  // Otherwise, use sqrt.approx if fast math is enabled
110  return !getTargetMachine().Options.UnsafeFPMath;
111  }
112 }
113 
114 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
115  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
116  DenormalMode::PreserveSign;
117 }
118 
119 static bool IsPTXVectorType(MVT VT) {
120  switch (VT.SimpleTy) {
121  default:
122  return false;
123  case MVT::v2i1:
124  case MVT::v4i1:
125  case MVT::v2i8:
126  case MVT::v4i8:
127  case MVT::v2i16:
128  case MVT::v4i16:
129  case MVT::v2i32:
130  case MVT::v4i32:
131  case MVT::v2i64:
132  case MVT::v2f16:
133  case MVT::v4f16:
134  case MVT::v8f16: // <4 x f16x2>
135  case MVT::v2f32:
136  case MVT::v4f32:
137  case MVT::v2f64:
138  return true;
139  }
140 }
141 
142 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
143 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
144 /// into their primitive components.
145 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
146 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
147 /// LowerCall, and LowerReturn.
148 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
149  Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
150  SmallVectorImpl<uint64_t> *Offsets = nullptr,
151  uint64_t StartingOffset = 0) {
152  SmallVector<EVT, 16> TempVTs;
153  SmallVector<uint64_t, 16> TempOffsets;
154 
155  // Special case for i128 - decompose to (i64, i64)
156  if (Ty->isIntegerTy(128)) {
157  ValueVTs.push_back(EVT(MVT::i64));
158  ValueVTs.push_back(EVT(MVT::i64));
159 
160  if (Offsets) {
161  Offsets->push_back(StartingOffset + 0);
162  Offsets->push_back(StartingOffset + 8);
163  }
164 
165  return;
166  }
167 
168  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
169  if (StructType *STy = dyn_cast<StructType>(Ty)) {
170  auto const *SL = DL.getStructLayout(STy);
171  auto ElementNum = 0;
172  for(auto *EI : STy->elements()) {
173  ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
174  StartingOffset + SL->getElementOffset(ElementNum));
175  ++ElementNum;
176  }
177  return;
178  }
179 
180  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
181  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
182  EVT VT = TempVTs[i];
183  uint64_t Off = TempOffsets[i];
184  // Split vectors into individual elements, except for v2f16, which
185  // we will pass as a single scalar.
186  if (VT.isVector()) {
187  unsigned NumElts = VT.getVectorNumElements();
188  EVT EltVT = VT.getVectorElementType();
189  // Vectors with an even number of f16 elements will be passed to
190  // us as an array of v2f16 elements. We must match this so we
191  // stay in sync with Ins/Outs.
192  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
193  EltVT = MVT::v2f16;
194  NumElts /= 2;
195  }
196  for (unsigned j = 0; j != NumElts; ++j) {
197  ValueVTs.push_back(EltVT);
198  if (Offsets)
199  Offsets->push_back(Off + j * EltVT.getStoreSize());
200  }
201  } else {
202  ValueVTs.push_back(VT);
203  if (Offsets)
204  Offsets->push_back(Off);
205  }
206  }
207 }
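// [Editorial worked example, not part of the original source] How the routine
// above flattens two common cases (assuming the usual NVPTX layout):
//  - Ty = <4 x half>, StartingOffset = 0: the four f16 elements are paired
//    into v2f16, giving ValueVTs = {v2f16, v2f16} and Offsets = {0, 4}
//    (v2f16 has a 4-byte store size).
//  - Ty = struct {i32, float}, StartingOffset = 0: the struct branch recurses
//    per element, giving ValueVTs = {i32, f32} and Offsets = {0, 4}.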
208 
209 // Check whether we can merge loads/stores of some of the pieces of a
210 // flattened function parameter or return value into a single vector
211 // load/store.
212 //
213 // The flattened parameter is represented as a list of EVTs and
214 // offsets, and the whole structure is aligned to ParamAlignment. This
215 // function determines whether we can load/store pieces of the
216 // parameter starting at index Idx using a single vectorized op of
217 // size AccessSize. If so, it returns the number of param pieces
218 // covered by the vector op. Otherwise, it returns 1.
219 static unsigned CanMergeParamLoadStoresStartingAt(
220  unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
221  const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
222 
223  // Can't vectorize if param alignment is not sufficient.
224  if (ParamAlignment < AccessSize)
225  return 1;
226  // Can't vectorize if offset is not aligned.
227  if (Offsets[Idx] & (AccessSize - 1))
228  return 1;
229 
230  EVT EltVT = ValueVTs[Idx];
231  unsigned EltSize = EltVT.getStoreSize();
232 
233  // Element is too large to vectorize.
234  if (EltSize >= AccessSize)
235  return 1;
236 
237  unsigned NumElts = AccessSize / EltSize;
238  // Can't vectorize if AccessSize is not a multiple of EltSize.
239  if (AccessSize != EltSize * NumElts)
240  return 1;
241 
242  // We don't have enough elements to vectorize.
243  if (Idx + NumElts > ValueVTs.size())
244  return 1;
245 
246  // PTX ISA can only deal with 2- and 4-element vector ops.
247  if (NumElts != 4 && NumElts != 2)
248  return 1;
249 
250  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
251  // Types do not match.
252  if (ValueVTs[j] != EltVT)
253  return 1;
254 
255  // Elements are not contiguous.
256  if (Offsets[j] - Offsets[j - 1] != EltSize)
257  return 1;
258  }
259  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
260  return NumElts;
261 }
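// [Editorial sketch, not part of the original source] The same merging rule,
// restated over plain byte sizes and offsets so it can be read (or compiled)
// in isolation. Names are hypothetical; the block is guarded out so it does
// not affect this translation unit.
#if 0
#include <cstdint>
#include <vector>

// Returns how many consecutive pieces starting at Idx could be covered by a
// single vector access of AccessSize bytes, or 1 if they cannot be merged.
static unsigned canMergePiecesAt(unsigned Idx, uint32_t AccessSize,
                                 const std::vector<uint32_t> &PieceSizes,
                                 const std::vector<uint64_t> &Offsets,
                                 uint64_t ParamAlignment) {
  if (ParamAlignment < AccessSize)       // whole parameter not aligned enough
    return 1;
  if (Offsets[Idx] & (AccessSize - 1))   // this piece not aligned to the access
    return 1;
  uint32_t EltSize = PieceSizes[Idx];
  if (EltSize >= AccessSize)             // piece already as large as the access
    return 1;
  unsigned NumElts = AccessSize / EltSize;
  if (AccessSize != EltSize * NumElts)   // access size not a multiple of piece
    return 1;
  if (Idx + NumElts > PieceSizes.size()) // not enough pieces left
    return 1;
  if (NumElts != 2 && NumElts != 4)      // PTX only has v2/v4 accesses
    return 1;
  for (unsigned J = Idx + 1; J < Idx + NumElts; ++J)
    if (PieceSizes[J] != EltSize || Offsets[J] - Offsets[J - 1] != EltSize)
      return 1;                          // mismatched piece or a gap
  return NumElts;
}
#endif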
262 
263 // Flags for tracking per-element vectorization state of loads/stores
264 // of a flattened function parameter or return value.
265 enum ParamVectorizationFlags {
266  PVF_INNER = 0x0, // Middle elements of a vector.
267  PVF_FIRST = 0x1, // First element of the vector.
268  PVF_LAST = 0x2, // Last element of the vector.
269  // Scalar is effectively a 1-element vector.
270  PVF_SCALAR = PVF_FIRST | PVF_LAST
271 };
272 
273 // Computes whether and how we can vectorize the loads/stores of a
274 // flattened function parameter or return value.
275 //
276 // The flattened parameter is represented as the list of ValueVTs and
277 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
278 // of the same size as ValueVTs indicating how each piece should be
279 // loaded/stored (i.e. as a scalar, or as part of a vector
280 // load/store).
281 static SmallVector<ParamVectorizationFlags, 16>
282 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
283  const SmallVectorImpl<uint64_t> &Offsets,
284  Align ParamAlignment) {
285  // Set vector size to match ValueVTs and mark all elements as
286  // scalars by default.
287  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
288  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
289 
290  // Check what we can vectorize using 128/64/32-bit accesses.
291  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
292  // Skip elements we've already processed.
293  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
294  for (unsigned AccessSize : {16, 8, 4, 2}) {
295  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
296  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
297  // Mark vectorized elements.
298  switch (NumElts) {
299  default:
300  llvm_unreachable("Unexpected return value");
301  case 1:
302  // Can't vectorize using this size, try next smaller size.
303  continue;
304  case 2:
305  assert(I + 1 < E && "Not enough elements.");
306  VectorInfo[I] = PVF_FIRST;
307  VectorInfo[I + 1] = PVF_LAST;
308  I += 1;
309  break;
310  case 4:
311  assert(I + 3 < E && "Not enough elements.");
312  VectorInfo[I] = PVF_FIRST;
313  VectorInfo[I + 1] = PVF_INNER;
314  VectorInfo[I + 2] = PVF_INNER;
315  VectorInfo[I + 3] = PVF_LAST;
316  I += 3;
317  break;
318  }
319  // Break out of the inner loop because we've already succeeded
320  // using largest possible AccessSize.
321  break;
322  }
323  }
324  return VectorInfo;
325 }
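// [Editorial worked example, not part of the original source] For a parameter
// flattened into four f32 pieces, ValueVTs = {f32, f32, f32, f32} and
// Offsets = {0, 4, 8, 12}:
//  - with ParamAlignment = 16 the 16-byte access succeeds at I = 0, so the
//    pieces are marked {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e. a
//    single v4 load/store;
//  - with ParamAlignment = 8 only the 8-byte access succeeds, giving
//    {PVF_FIRST, PVF_LAST, PVF_FIRST, PVF_LAST}, i.e. two v2 accesses.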
326 
327 // NVPTXTargetLowering Constructor.
328 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
329  const NVPTXSubtarget &STI)
330  : TargetLowering(TM), nvTM(&TM), STI(STI) {
331  // Always lower memset, memcpy, and memmove intrinsics to load/store
332  // instructions, rather than generating calls to memset, memcpy, or
333  // memmove.
334  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
335  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
336  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
337 
338  setBooleanContents(ZeroOrNegativeOneBooleanContent);
339  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
340 
341  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
342  // condition branches.
343  setJumpIsExpensive(true);
344 
345  // Wide divides are _very_ slow. Try to reduce the width of the divide if
346  // possible.
347  addBypassSlowDiv(64, 32);
348 
349  // By default, use the Source scheduling
350  if (sched4reg)
351  setSchedulingPreference(Sched::RegPressure);
352  else
353  setSchedulingPreference(Sched::Source);
354 
355  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
356  LegalizeAction NoF16Action) {
357  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
358  };
359 
360  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
361  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
362  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
363  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
364  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
365  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
366  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
367  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
368 
369  // Conversion to/from FP16/FP16x2 is always legal.
376 
377  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
378  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
379 
380  // Operations not directly supported by NVPTX.
381  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
382  MVT::i16, MVT::i32, MVT::i64}) {
383  setOperationAction(ISD::SELECT_CC, VT, Expand);
384  setOperationAction(ISD::BR_CC, VT, Expand);
385  }
386 
387  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
388  // For others we will expand to a SHL/SRA pair.
394 
401 
404 
405  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
406  // that don't have h/w rotation we lower them to multi-instruction assembly.
407  // See ROT*_sw in NVPTXIntrInfo.td
412 
420 
421  // Indirect branch is not supported.
422  // This also disables Jump Table creation.
425 
428 
429  // We want to legalize constant related memmove and memcopy
430  // intrinsics.
432 
433  // Turn FP extload into load/fpextend
443  // Turn FP truncstore into trunc + store.
444  // FIXME: vector types should also be expanded
448 
449  // PTX does not support load / store predicate registers
452 
453  for (MVT VT : MVT::integer_valuetypes()) {
457  }
458 
459  // This is legal in NVPTX
463 
464  // TRAP can be lowered to PTX trap
466 
467  // Register custom handling for vector loads/stores
468  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
469  if (IsPTXVectorType(VT)) {
473  }
474  }
475 
476  // Custom handling for i8 intrinsics
478 
479  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
485 
488  }
489 
494  if (STI.getPTXVersion() >= 43) {
499  }
500 
504 
505  // PTX does not directly support SELP of i1, so promote to i32 first
507 
508  // PTX cannot multiply two i64s in a single instruction.
511 
512  // We have some custom DAG combine patterns for these nodes
514  ISD::SREM, ISD::UREM});
515 
516  // setcc for f16x2 needs special handling to prevent legalizer's
517  // attempt to scalarize it due to v2i1 not being legal.
518  if (STI.allowFP16Math())
520 
521  // Promote fp16 arithmetic if fp16 hardware isn't available or the
522  // user passed --nvptx-no-fp16-math. The flag is useful because,
523  // although sm_53+ GPUs have some sort of FP16 support in
524  // hardware, only sm_53 and sm_60 have full implementation. Others
525  // only have a token amount of hardware and are likely to run faster
526  // by using fp32 units instead.
527  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
528  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
529  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
530  }
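  // [Editorial note, not part of the original source] Concretely, when
  // STI.allowFP16Math() is false the two calls above make ISD::FADD (and the
  // other ops in this loop) Promote on MVT::f16, so the math is done in f32,
  // and Expand on MVT::v2f16, so the vector is split into scalar f16 ops that
  // are then promoted as well.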
531 
532  // There's no neg.f16 instruction. Expand to (0-x).
535 
536  // (would be) Library functions.
537 
538  // These map to conversion instructions for scalar FP types.
539  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
540  ISD::FTRUNC}) {
545  }
546 
551 
552 
553  // 'Expand' implements FCOPYSIGN without calling an external library.
558 
559  // These map to corresponding instructions for f32/f64. f16 must be
560  // promoted to f32. v2f16 is expanded to f16, which is then promoted
561  // to f32.
562  for (const auto &Op :
568  }
569  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
570  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
571  bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
572  return IsAtLeastSm80 ? Legal : NotSm80Action;
573  };
574  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
575  setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
578  setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
579  }
580  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
581  setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
582  setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
583  setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
584  }
585 
586  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
587  // No FPOW or FREM in PTX.
588 
589  // Now deduce the information based on the above mentioned
590  // actions
592 
594 }
595 
596 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
597  switch ((NVPTXISD::NodeType)Opcode) {
598  case NVPTXISD::FIRST_NUMBER:
599  break;
600  case NVPTXISD::CALL:
601  return "NVPTXISD::CALL";
602  case NVPTXISD::RET_FLAG:
603  return "NVPTXISD::RET_FLAG";
605  return "NVPTXISD::LOAD_PARAM";
606  case NVPTXISD::Wrapper:
607  return "NVPTXISD::Wrapper";
609  return "NVPTXISD::DeclareParam";
611  return "NVPTXISD::DeclareScalarParam";
613  return "NVPTXISD::DeclareRet";
615  return "NVPTXISD::DeclareScalarRet";
617  return "NVPTXISD::DeclareRetParam";
618  case NVPTXISD::PrintCall:
619  return "NVPTXISD::PrintCall";
621  return "NVPTXISD::PrintConvergentCall";
623  return "NVPTXISD::PrintCallUni";
625  return "NVPTXISD::PrintConvergentCallUni";
626  case NVPTXISD::LoadParam:
627  return "NVPTXISD::LoadParam";
629  return "NVPTXISD::LoadParamV2";
631  return "NVPTXISD::LoadParamV4";
633  return "NVPTXISD::StoreParam";
635  return "NVPTXISD::StoreParamV2";
637  return "NVPTXISD::StoreParamV4";
639  return "NVPTXISD::StoreParamS32";
641  return "NVPTXISD::StoreParamU32";
643  return "NVPTXISD::CallArgBegin";
644  case NVPTXISD::CallArg:
645  return "NVPTXISD::CallArg";
647  return "NVPTXISD::LastCallArg";
649  return "NVPTXISD::CallArgEnd";
650  case NVPTXISD::CallVoid:
651  return "NVPTXISD::CallVoid";
652  case NVPTXISD::CallVal:
653  return "NVPTXISD::CallVal";
655  return "NVPTXISD::CallSymbol";
656  case NVPTXISD::Prototype:
657  return "NVPTXISD::Prototype";
658  case NVPTXISD::MoveParam:
659  return "NVPTXISD::MoveParam";
661  return "NVPTXISD::StoreRetval";
663  return "NVPTXISD::StoreRetvalV2";
665  return "NVPTXISD::StoreRetvalV4";
667  return "NVPTXISD::PseudoUseParam";
668  case NVPTXISD::RETURN:
669  return "NVPTXISD::RETURN";
671  return "NVPTXISD::CallSeqBegin";
673  return "NVPTXISD::CallSeqEnd";
675  return "NVPTXISD::CallPrototype";
676  case NVPTXISD::ProxyReg:
677  return "NVPTXISD::ProxyReg";
678  case NVPTXISD::LoadV2:
679  return "NVPTXISD::LoadV2";
680  case NVPTXISD::LoadV4:
681  return "NVPTXISD::LoadV4";
682  case NVPTXISD::LDGV2:
683  return "NVPTXISD::LDGV2";
684  case NVPTXISD::LDGV4:
685  return "NVPTXISD::LDGV4";
686  case NVPTXISD::LDUV2:
687  return "NVPTXISD::LDUV2";
688  case NVPTXISD::LDUV4:
689  return "NVPTXISD::LDUV4";
690  case NVPTXISD::StoreV2:
691  return "NVPTXISD::StoreV2";
692  case NVPTXISD::StoreV4:
693  return "NVPTXISD::StoreV4";
695  return "NVPTXISD::FUN_SHFL_CLAMP";
697  return "NVPTXISD::FUN_SHFR_CLAMP";
698  case NVPTXISD::IMAD:
699  return "NVPTXISD::IMAD";
701  return "NVPTXISD::SETP_F16X2";
702  case NVPTXISD::Dummy:
703  return "NVPTXISD::Dummy";
705  return "NVPTXISD::MUL_WIDE_SIGNED";
707  return "NVPTXISD::MUL_WIDE_UNSIGNED";
708  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
709  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
711  return "NVPTXISD::Tex1DFloatFloatLevel";
713  return "NVPTXISD::Tex1DFloatFloatGrad";
714  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
715  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
717  return "NVPTXISD::Tex1DS32FloatLevel";
719  return "NVPTXISD::Tex1DS32FloatGrad";
720  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
721  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
723  return "NVPTXISD::Tex1DU32FloatLevel";
725  return "NVPTXISD::Tex1DU32FloatGrad";
726  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
727  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
729  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
731  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
732  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
733  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
735  return "NVPTXISD::Tex1DArrayS32FloatLevel";
737  return "NVPTXISD::Tex1DArrayS32FloatGrad";
738  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
739  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
741  return "NVPTXISD::Tex1DArrayU32FloatLevel";
743  return "NVPTXISD::Tex1DArrayU32FloatGrad";
744  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
745  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
747  return "NVPTXISD::Tex2DFloatFloatLevel";
749  return "NVPTXISD::Tex2DFloatFloatGrad";
750  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
751  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
753  return "NVPTXISD::Tex2DS32FloatLevel";
755  return "NVPTXISD::Tex2DS32FloatGrad";
756  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
757  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
759  return "NVPTXISD::Tex2DU32FloatLevel";
761  return "NVPTXISD::Tex2DU32FloatGrad";
762  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
763  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
765  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
767  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
768  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
769  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
771  return "NVPTXISD::Tex2DArrayS32FloatLevel";
773  return "NVPTXISD::Tex2DArrayS32FloatGrad";
774  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
775  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
777  return "NVPTXISD::Tex2DArrayU32FloatLevel";
779  return "NVPTXISD::Tex2DArrayU32FloatGrad";
780  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
781  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
783  return "NVPTXISD::Tex3DFloatFloatLevel";
785  return "NVPTXISD::Tex3DFloatFloatGrad";
786  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
787  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
789  return "NVPTXISD::Tex3DS32FloatLevel";
791  return "NVPTXISD::Tex3DS32FloatGrad";
792  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
793  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
795  return "NVPTXISD::Tex3DU32FloatLevel";
797  return "NVPTXISD::Tex3DU32FloatGrad";
798  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
800  return "NVPTXISD::TexCubeFloatFloatLevel";
801  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
803  return "NVPTXISD::TexCubeS32FloatLevel";
804  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
806  return "NVPTXISD::TexCubeU32FloatLevel";
808  return "NVPTXISD::TexCubeArrayFloatFloat";
810  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
812  return "NVPTXISD::TexCubeArrayS32Float";
814  return "NVPTXISD::TexCubeArrayS32FloatLevel";
816  return "NVPTXISD::TexCubeArrayU32Float";
818  return "NVPTXISD::TexCubeArrayU32FloatLevel";
820  return "NVPTXISD::Tld4R2DFloatFloat";
822  return "NVPTXISD::Tld4G2DFloatFloat";
824  return "NVPTXISD::Tld4B2DFloatFloat";
826  return "NVPTXISD::Tld4A2DFloatFloat";
828  return "NVPTXISD::Tld4R2DS64Float";
830  return "NVPTXISD::Tld4G2DS64Float";
832  return "NVPTXISD::Tld4B2DS64Float";
834  return "NVPTXISD::Tld4A2DS64Float";
836  return "NVPTXISD::Tld4R2DU64Float";
838  return "NVPTXISD::Tld4G2DU64Float";
840  return "NVPTXISD::Tld4B2DU64Float";
842  return "NVPTXISD::Tld4A2DU64Float";
843 
845  return "NVPTXISD::TexUnified1DFloatS32";
847  return "NVPTXISD::TexUnified1DFloatFloat";
849  return "NVPTXISD::TexUnified1DFloatFloatLevel";
851  return "NVPTXISD::TexUnified1DFloatFloatGrad";
853  return "NVPTXISD::TexUnified1DS32S32";
855  return "NVPTXISD::TexUnified1DS32Float";
857  return "NVPTXISD::TexUnified1DS32FloatLevel";
859  return "NVPTXISD::TexUnified1DS32FloatGrad";
861  return "NVPTXISD::TexUnified1DU32S32";
863  return "NVPTXISD::TexUnified1DU32Float";
865  return "NVPTXISD::TexUnified1DU32FloatLevel";
867  return "NVPTXISD::TexUnified1DU32FloatGrad";
869  return "NVPTXISD::TexUnified1DArrayFloatS32";
871  return "NVPTXISD::TexUnified1DArrayFloatFloat";
873  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
875  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
877  return "NVPTXISD::TexUnified1DArrayS32S32";
879  return "NVPTXISD::TexUnified1DArrayS32Float";
881  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
883  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
885  return "NVPTXISD::TexUnified1DArrayU32S32";
887  return "NVPTXISD::TexUnified1DArrayU32Float";
889  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
891  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
893  return "NVPTXISD::TexUnified2DFloatS32";
895  return "NVPTXISD::TexUnified2DFloatFloat";
897  return "NVPTXISD::TexUnified2DFloatFloatLevel";
899  return "NVPTXISD::TexUnified2DFloatFloatGrad";
901  return "NVPTXISD::TexUnified2DS32S32";
903  return "NVPTXISD::TexUnified2DS32Float";
905  return "NVPTXISD::TexUnified2DS32FloatLevel";
907  return "NVPTXISD::TexUnified2DS32FloatGrad";
909  return "NVPTXISD::TexUnified2DU32S32";
911  return "NVPTXISD::TexUnified2DU32Float";
913  return "NVPTXISD::TexUnified2DU32FloatLevel";
915  return "NVPTXISD::TexUnified2DU32FloatGrad";
917  return "NVPTXISD::TexUnified2DArrayFloatS32";
919  return "NVPTXISD::TexUnified2DArrayFloatFloat";
921  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
923  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
925  return "NVPTXISD::TexUnified2DArrayS32S32";
927  return "NVPTXISD::TexUnified2DArrayS32Float";
929  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
931  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
933  return "NVPTXISD::TexUnified2DArrayU32S32";
935  return "NVPTXISD::TexUnified2DArrayU32Float";
937  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
939  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
941  return "NVPTXISD::TexUnified3DFloatS32";
943  return "NVPTXISD::TexUnified3DFloatFloat";
945  return "NVPTXISD::TexUnified3DFloatFloatLevel";
947  return "NVPTXISD::TexUnified3DFloatFloatGrad";
949  return "NVPTXISD::TexUnified3DS32S32";
951  return "NVPTXISD::TexUnified3DS32Float";
953  return "NVPTXISD::TexUnified3DS32FloatLevel";
955  return "NVPTXISD::TexUnified3DS32FloatGrad";
957  return "NVPTXISD::TexUnified3DU32S32";
959  return "NVPTXISD::TexUnified3DU32Float";
961  return "NVPTXISD::TexUnified3DU32FloatLevel";
963  return "NVPTXISD::TexUnified3DU32FloatGrad";
965  return "NVPTXISD::TexUnifiedCubeFloatFloat";
967  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
969  return "NVPTXISD::TexUnifiedCubeS32Float";
971  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
973  return "NVPTXISD::TexUnifiedCubeU32Float";
975  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
977  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
979  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
981  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
983  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
985  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
987  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
989  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
991  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
993  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
995  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
997  return "NVPTXISD::Tld4UnifiedR2DS64Float";
999  return "NVPTXISD::Tld4UnifiedG2DS64Float";
1001  return "NVPTXISD::Tld4UnifiedB2DS64Float";
1003  return "NVPTXISD::Tld4UnifiedA2DS64Float";
1005  return "NVPTXISD::Tld4UnifiedR2DU64Float";
1007  return "NVPTXISD::Tld4UnifiedG2DU64Float";
1009  return "NVPTXISD::Tld4UnifiedB2DU64Float";
1011  return "NVPTXISD::Tld4UnifiedA2DU64Float";
1012 
1013  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1014  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1015  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1016  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1017  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1018  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1019  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1020  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1021  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1022  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1023  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1024 
1025  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1026  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1027  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1028  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1029  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1030  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1031  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1032  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1033  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1034  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1035  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1036 
1037  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1038  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1039  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1040  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1041  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1042  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1043  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1044  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1045  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1046  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1047  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1048 
1049  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1050  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1051  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1052  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1053  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1054  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1055  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1056  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1057  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1058  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1059  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1060 
1061  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1062  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1063  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1064  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1065  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1066  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1067  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1068  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1069  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1070  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1071  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1072 
1073  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1074  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1075  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1076  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1077  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1078  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1079  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1080  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1081  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1082  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1083  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1084 
1085  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1086  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1087  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1088  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1089  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1090  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1091  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1092  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1093  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1094  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1095  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1096 
1097  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1098  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1099  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1100  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1101  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1102  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1103  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1104  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1105  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1106  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1107  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1108 
1109  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1110  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1111  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1112  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1113  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1114  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1115  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1116  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1117  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1118  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1119  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1120 
1121  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1122  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1123  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1124  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1125  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1126  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1127  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1128  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1129  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1130  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1131  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1132 
1133  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1134  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1135  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1136  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1137  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1138  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1139  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1140  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1141  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1142  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1143  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1144 
1145  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1146  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1147  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1148  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1149  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1150  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1151  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1152  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1153  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1154  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1155  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1156 
1157  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1158  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1159  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1160  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1161  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1162  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1163  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1164  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1165  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1166  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1167  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1168 
1169  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1170  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1171  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1172  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1173  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1174  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1175  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1176  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1177  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1178  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1179  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1180 
1181  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1182  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1183  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1184  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1185  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1186  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1187  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1188  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1189  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1190  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1191  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1192  }
1193  return nullptr;
1194 }
1195 
1196 TargetLoweringBase::LegalizeTypeAction
1197 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1198  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1199  VT.getScalarType() == MVT::i1)
1200  return TypeSplitVector;
1201  if (VT == MVT::v2f16)
1202  return TypeLegal;
1203  return TargetLoweringBase::getPreferredVectorAction(VT);
1204 }
1205 
1206 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1207  int Enabled, int &ExtraSteps,
1208  bool &UseOneConst,
1209  bool Reciprocal) const {
1210  if (!(Enabled == ReciprocalEstimate::Enabled ||
1211  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1212  return SDValue();
1213 
1214  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1215  ExtraSteps = 0;
1216 
1217  SDLoc DL(Operand);
1218  EVT VT = Operand.getValueType();
1219  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1220 
1221  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1222  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1223  DAG.getConstant(IID, DL, MVT::i32), Operand);
1224  };
1225 
1226  // The sqrt and rsqrt refinement processes assume we always start out with an
1227  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1228  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1229  // any refinement, we must return a regular sqrt.
1230  if (Reciprocal || ExtraSteps > 0) {
1231  if (VT == MVT::f32)
1232  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1233  : Intrinsic::nvvm_rsqrt_approx_f);
1234  else if (VT == MVT::f64)
1235  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1236  else
1237  return SDValue();
1238  } else {
1239  if (VT == MVT::f32)
1240  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1241  : Intrinsic::nvvm_sqrt_approx_f);
1242  else {
1243  // There's no sqrt.approx.f64 instruction, so we emit
1244  // reciprocal(rsqrt(x)). This is faster than
1245  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1246  // x * rsqrt(x).)
1247  return DAG.getNode(
1248  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1249  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1250  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1251  }
1252  }
1253 }
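// [Editorial sketch, not part of the original source] The "refinement" the
// comment above refers to starts from an approximation y ~= 1/sqrt(x); each
// extra step is one Newton-Raphson iteration, and sqrt(x) itself can then be
// recovered as x * y or, as the f64 path above does, as reciprocal(y).
#if 0
// Guarded out; illustrative only.
static double refineRsqrtOnce(double X, double Y) {
  // Y' = Y * (1.5 - 0.5 * X * Y * Y) quadratically improves Y ~= 1/sqrt(X).
  return Y * (1.5 - 0.5 * X * Y * Y);
}
#endif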
1254 
1255 SDValue
1256 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1257  SDLoc dl(Op);
1258  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1259  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1260  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1261  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1262 }
1263 
1264 std::string NVPTXTargetLowering::getPrototype(
1265  const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1266  const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1267  const CallBase &CB, unsigned UniqueCallSite) const {
1268  auto PtrVT = getPointerTy(DL);
1269 
1270  bool isABI = (STI.getSmVersion() >= 20);
1271  assert(isABI && "Non-ABI compilation is not supported");
1272  if (!isABI)
1273  return "";
1274 
1275  std::stringstream O;
1276  O << "prototype_" << UniqueCallSite << " : .callprototype ";
1277 
1278  if (retTy->getTypeID() == Type::VoidTyID) {
1279  O << "()";
1280  } else {
1281  O << "(";
1282  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1283  unsigned size = 0;
1284  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1285  size = ITy->getBitWidth();
1286  } else {
1287  assert(retTy->isFloatingPointTy() &&
1288  "Floating point type expected here");
1289  size = retTy->getPrimitiveSizeInBits();
1290  }
1291  // PTX ABI requires all scalar return values to be at least 32
1292  // bits in size. fp16 normally uses .b16 as its storage type in
1293  // PTX, so its size must be adjusted here, too.
1294  if (size < 32)
1295  size = 32;
1296 
1297  O << ".param .b" << size << " _";
1298  } else if (isa<PointerType>(retTy)) {
1299  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1300  } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
1301  retTy->isIntegerTy(128)) {
1302  O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1303  << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1304  } else {
1305  llvm_unreachable("Unknown return type");
1306  }
1307  O << ") ";
1308  }
1309  O << "_ (";
1310 
1311  bool first = true;
1312 
1313  const Function *F = CB.getFunction();
1314  for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) {
1315  Type *Ty = Args[i].Ty;
1316  if (!first) {
1317  O << ", ";
1318  }
1319  first = false;
1320 
1321  if (!Outs[OIdx].Flags.isByVal()) {
1322  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1323  unsigned ParamAlign = 0;
1324  const CallInst *CallI = cast<CallInst>(&CB);
1325  // +1 because index 0 is reserved for return type alignment
1326  if (!getAlign(*CallI, i + 1, ParamAlign))
1327  ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1328  O << ".param .align " << ParamAlign << " .b8 ";
1329  O << "_";
1330  O << "[" << DL.getTypeAllocSize(Ty) << "]";
1331  // update the index for Outs
1332  SmallVector<EVT, 16> vtparts;
1333  ComputeValueVTs(*this, DL, Ty, vtparts);
1334  if (unsigned len = vtparts.size())
1335  OIdx += len - 1;
1336  continue;
1337  }
1338  // i8 types in IR will be i16 types in SDAG
1339  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1340  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1341  "type mismatch between callee prototype and arguments");
1342  // scalar type
1343  unsigned sz = 0;
1344  if (isa<IntegerType>(Ty)) {
1345  sz = cast<IntegerType>(Ty)->getBitWidth();
1346  if (sz < 32)
1347  sz = 32;
1348  } else if (isa<PointerType>(Ty)) {
1349  sz = PtrVT.getSizeInBits();
1350  } else if (Ty->isHalfTy())
1351  // PTX ABI requires all scalar parameters to be at least 32
1352  // bits in size. fp16 normally uses .b16 as its storage type
1353  // in PTX, so its size must be adjusted here, too.
1354  sz = 32;
1355  else
1356  sz = Ty->getPrimitiveSizeInBits();
1357  O << ".param .b" << sz << " ";
1358  O << "_";
1359  continue;
1360  }
1361 
1362  Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1363 
1364  // Try to increase alignment. This code matches logic in LowerCall when
1365  // alignment increase is performed to increase vectorization options.
1366  Type *ETy = Args[i].IndirectType;
1367  Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
1368  ParamByValAlign = std::max(ParamByValAlign, AlignCandidate);
1369 
1370  O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1371  O << "_";
1372  O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1373  }
1374  O << ");";
1375  return O.str();
1376 }
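// [Editorial example, not part of the original source] For a callee declared
// as "float f(int, float *)", with UniqueCallSite == 0 and 64-bit generic
// pointers, the string assembled above comes out roughly as:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);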
1377 
1378 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1379  const CallBase *CB, Type *Ty,
1380  unsigned Idx,
1381  const DataLayout &DL) const {
1382  if (!CB) {
1383  // CallSite is zero, fallback to ABI type alignment
1384  return DL.getABITypeAlign(Ty);
1385  }
1386 
1387  unsigned Alignment = 0;
1388  const Function *DirectCallee = CB->getCalledFunction();
1389 
1390  if (!DirectCallee) {
1391  // We don't have a direct function symbol, but that may be because of
1392  // constant cast instructions in the call.
1393 
1394  // With bitcast'd call targets, the instruction will be the call
1395  if (const auto *CI = dyn_cast<CallInst>(CB)) {
1396  // Check if we have call alignment metadata
1397  if (getAlign(*CI, Idx, Alignment))
1398  return Align(Alignment);
1399 
1400  const Value *CalleeV = CI->getCalledOperand();
1401  // Ignore any bitcast instructions
1402  while (isa<ConstantExpr>(CalleeV)) {
1403  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1404  if (!CE->isCast())
1405  break;
1406  // Look through the bitcast
1407  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1408  }
1409 
1410  // We have now looked past all of the bitcasts. Do we finally have a
1411  // Function?
1412  if (const auto *CalleeF = dyn_cast<Function>(CalleeV))
1413  DirectCallee = CalleeF;
1414  }
1415  }
1416 
1417  // Check for function alignment information if we found that the
1418  // ultimate target is a Function
1419  if (DirectCallee) {
1420  if (getAlign(*DirectCallee, Idx, Alignment))
1421  return Align(Alignment);
1422  // If alignment information is not available, fall back to the
1423  // default function param optimized type alignment
1424  return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1425  }
1426 
1427  // Call is indirect, fall back to the ABI type alignment
1428  return DL.getABITypeAlign(Ty);
1429 }
1430 
1431 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1432  SmallVectorImpl<SDValue> &InVals) const {
1433  SelectionDAG &DAG = CLI.DAG;
1434  SDLoc dl = CLI.DL;
1435  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1436  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1437  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1438  SDValue Chain = CLI.Chain;
1439  SDValue Callee = CLI.Callee;
1440  bool &isTailCall = CLI.IsTailCall;
1441  ArgListTy &Args = CLI.getArgs();
1442  Type *RetTy = CLI.RetTy;
1443  const CallBase *CB = CLI.CB;
1444  const DataLayout &DL = DAG.getDataLayout();
1445 
1446  bool isABI = (STI.getSmVersion() >= 20);
1447  assert(isABI && "Non-ABI compilation is not supported");
1448  if (!isABI)
1449  return Chain;
1450 
1451  unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1452  SDValue TempChain = Chain;
1453  Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1454  SDValue InFlag = Chain.getValue(1);
1455 
1456  unsigned ParamCount = 0;
1457  // Args.size() and Outs.size() need not match.
1458  // Outs.size() will be larger
1459  // * if there is an aggregate argument with multiple fields (each field
1460  // showing up separately in Outs)
1461  // * if there is a vector argument with more than typical vector-length
1462  // elements (generally if more than 4) where each vector element is
1463  // individually present in Outs.
1464  // So a different index should be used for indexing into Outs/OutVals.
1465  // See similar issue in LowerFormalArguments.
1466  unsigned OIdx = 0;
1467  // Declare the .param or .reg spaces needed to pass values
1468  // to the function.
1469  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1470  EVT VT = Outs[OIdx].VT;
1471  Type *Ty = Args[i].Ty;
1472  bool IsByVal = Outs[OIdx].Flags.isByVal();
1473 
1474  SmallVector<EVT, 16> VTs;
1475  SmallVector<uint64_t, 16> Offsets;
1476 
1477  assert((!IsByVal || Args[i].IndirectType) &&
1478  "byval arg must have indirect type");
1479  Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1480  ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets);
1481 
1482  Align ArgAlign;
1483  if (IsByVal) {
1484  // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1485  // so we don't need to worry whether it's naturally aligned or not.
1486  // See TargetLowering::LowerCallTo().
1487  ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1488 
1489  // Try to increase alignment to enhance vectorization options.
1490  ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(
1491  CB->getCalledFunction(), ETy, DL));
1492 
1493  // Enforce minimum alignment of 4 to work around ptxas miscompile
1494  // for sm_50+. See corresponding alignment adjustment in
1495  // emitFunctionParamList() for details.
1496  ArgAlign = std::max(ArgAlign, Align(4));
1497  } else {
1498  ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
1499  }
1500 
1501  unsigned TypeSize =
1502  (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1503  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1504 
1505  bool NeedAlign; // Does argument declaration specify alignment?
1506  if (IsByVal ||
1507  (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128))) {
1508  // declare .param .align <align> .b8 .param<n>[<size>];
1509  SDValue DeclareParamOps[] = {
1510  Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1511  DAG.getConstant(ParamCount, dl, MVT::i32),
1512  DAG.getConstant(TypeSize, dl, MVT::i32), InFlag};
1513  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1514  DeclareParamOps);
1515  NeedAlign = true;
1516  } else {
1517  // declare .param .b<size> .param<n>;
1518  if ((VT.isInteger() || VT.isFloatingPoint()) && TypeSize < 4) {
1519  // PTX ABI requires integral types to be at least 32 bits in
1520  // size. FP16 is loaded/stored using i16, so it's handled
1521  // here as well.
1522  TypeSize = 4;
1523  }
1524  SDValue DeclareScalarParamOps[] = {
1525  Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1526  DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1527  DAG.getConstant(0, dl, MVT::i32), InFlag};
1528  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1529  DeclareScalarParamOps);
1530  NeedAlign = false;
1531  }
1532  InFlag = Chain.getValue(1);
1533 
1534  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1535  // than 32-bits are sign extended or zero extended, depending on
1536  // whether they are signed or unsigned types. This case applies
1537  // only to scalar parameters and not to aggregate values.
1538  bool ExtendIntegerParam =
1539  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1540 
1541  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1542  SmallVector<SDValue, 6> StoreOperands;
1543  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1544  EVT EltVT = VTs[j];
1545  int CurOffset = Offsets[j];
1546  MaybeAlign PartAlign;
1547  if (NeedAlign)
1548  PartAlign = commonAlignment(ArgAlign, CurOffset);
1549 
1550  // New store.
1551  if (VectorInfo[j] & PVF_FIRST) {
1552  assert(StoreOperands.empty() && "Unfinished preceding store.");
1553  StoreOperands.push_back(Chain);
1554  StoreOperands.push_back(DAG.getConstant(ParamCount, dl, MVT::i32));
1555  StoreOperands.push_back(DAG.getConstant(CurOffset, dl, MVT::i32));
1556  }
1557 
1558  SDValue StVal = OutVals[OIdx];
1559  if (IsByVal) {
1560  auto PtrVT = getPointerTy(DL);
1561  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1562  DAG.getConstant(CurOffset, dl, PtrVT));
1563  StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1564  PartAlign);
1565  } else if (ExtendIntegerParam) {
1566  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1567  // zext/sext to i32
1568  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1569  : ISD::ZERO_EXTEND,
1570  dl, MVT::i32, StVal);
1571  }
1572 
1573  if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1574  // Use 16-bit registers for small stores as it's the
1575  // smallest general purpose register size supported by NVPTX.
1576  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1577  }
1578 
1579  // Record the value to store.
1580  StoreOperands.push_back(StVal);
1581 
1582  if (VectorInfo[j] & PVF_LAST) {
1583  unsigned NumElts = StoreOperands.size() - 3;
1584  NVPTXISD::NodeType Op;
1585  switch (NumElts) {
1586  case 1:
1587  Op = NVPTXISD::StoreParam;
1588  break;
1589  case 2:
1590  Op = NVPTXISD::StoreParamV2;
1591  break;
1592  case 4:
1593  Op = NVPTXISD::StoreParamV4;
1594  break;
1595  default:
1596  llvm_unreachable("Invalid vector info.");
1597  }
1598 
1599  StoreOperands.push_back(InFlag);
1600 
1601  // Adjust type of the store op if we've extended the scalar
1602  // return value.
1603  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1604 
1605  Chain = DAG.getMemIntrinsicNode(
1606  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1607  TheStoreType, MachinePointerInfo(), PartAlign,
1608  MachineMemOperand::MOStore);
1609  InFlag = Chain.getValue(1);
1610 
1611  // Cleanup.
1612  StoreOperands.clear();
1613  }
1614  if (!IsByVal)
1615  ++OIdx;
1616  }
1617  assert(StoreOperands.empty() && "Unfinished parameter store.");
1618  if (!IsByVal && VTs.size() > 0)
1619  --OIdx;
1620  ++ParamCount;
1621  }
1622 
1623  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1624  MaybeAlign retAlignment = None;
1625 
1626  // Handle Result
1627  if (Ins.size() > 0) {
1628  SmallVector<EVT, 16> resvtparts;
1629  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1630 
1631  // Declare
1632  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1633  // .param .b<size-in-bits> retval0
1634  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1635  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1636  // these three types to match the logic in
1637  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1638  // Plus, this behavior is consistent with nvcc's.
1639  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1640  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1641  // Scalar needs to be at least 32bit wide
1642  if (resultsz < 32)
1643  resultsz = 32;
1644  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1645  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1646  DAG.getConstant(resultsz, dl, MVT::i32),
1647  DAG.getConstant(0, dl, MVT::i32), InFlag };
1648  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1649  DeclareRetOps);
1650  InFlag = Chain.getValue(1);
1651  } else {
1652  retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1653  assert(retAlignment && "retAlignment is guaranteed to be set");
1654  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1655  SDValue DeclareRetOps[] = {
1656  Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1657  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1658  DAG.getConstant(0, dl, MVT::i32), InFlag};
1659  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1660  DeclareRetOps);
1661  InFlag = Chain.getValue(1);
1662  }
1663  }
1664 
1665  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1666  // between them we must rely on the call site value which is valid for
1667  // indirect calls but is always null for libcalls.
1668  bool isIndirectCall = !Func && CB;
1669 
1670  if (isa<ExternalSymbolSDNode>(Callee)) {
1671  Function* CalleeFunc = nullptr;
1672 
1673  // Try to find the callee in the current module.
1674  Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1675  assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1676 
1677  // Set the "libcall callee" attribute to indicate that the function
1678  // must always have a declaration.
1679  CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1680  }
1681 
1682  if (isIndirectCall) {
 1683  // This is the indirect function call case: PTX requires a prototype of the
 1684  // form
 1685  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
 1686  // to be emitted, and the label has to be used as the last arg of the call
 1687  // instruction.
 1688  // The prototype is embedded in a string, and put as the operand of a
 1689  // CallPrototype SDNode which prints out as the value of the string.
1690  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1691  std::string Proto =
1692  getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite);
1693  const char *ProtoStr =
1694  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1695  SDValue ProtoOps[] = {
1696  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1697  };
1698  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1699  InFlag = Chain.getValue(1);
1700  }
1701  // Op to just print "call"
1702  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1703  SDValue PrintCallOps[] = {
1704  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1705  };
1706  // We model convergent calls as separate opcodes.
 1707  unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
 1708  if (CLI.IsConvergent)
 1709  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
 1710  : NVPTXISD::PrintConvergentCall;
 1711  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1712  InFlag = Chain.getValue(1);
1713 
1714  // Ops to print out the function name
1715  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1716  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1717  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1718  InFlag = Chain.getValue(1);
1719 
1720  // Ops to print out the param list
1721  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1722  SDValue CallArgBeginOps[] = { Chain, InFlag };
1723  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1724  CallArgBeginOps);
1725  InFlag = Chain.getValue(1);
1726 
1727  for (unsigned i = 0, e = ParamCount; i != e; ++i) {
1728  unsigned opcode;
1729  if (i == (e - 1))
1730  opcode = NVPTXISD::LastCallArg;
1731  else
1732  opcode = NVPTXISD::CallArg;
1733  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1734  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1735  DAG.getConstant(i, dl, MVT::i32), InFlag };
1736  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1737  InFlag = Chain.getValue(1);
1738  }
1739  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1740  SDValue CallArgEndOps[] = { Chain,
1741  DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1742  InFlag };
1743  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1744  InFlag = Chain.getValue(1);
1745 
1746  if (isIndirectCall) {
1747  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1748  SDValue PrototypeOps[] = {
1749  Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
1750  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1751  InFlag = Chain.getValue(1);
1752  }
1753 
1754  SmallVector<SDValue, 16> ProxyRegOps;
1755  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1756 
1757  // Generate loads from param memory/moves from registers for result
1758  if (Ins.size() > 0) {
 1759  SmallVector<EVT, 16> VTs;
 1760  SmallVector<uint64_t, 16> Offsets;
 1761  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1762  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1763 
1764  Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1765  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1766 
1767  SmallVector<EVT, 6> LoadVTs;
1768  int VecIdx = -1; // Index of the first element of the vector.
1769 
1770  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1771  // 32-bits are sign extended or zero extended, depending on whether
1772  // they are signed or unsigned types.
1773  bool ExtendIntegerRetVal =
1774  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1775 
1776  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1777  bool needTruncate = false;
1778  EVT TheLoadType = VTs[i];
1779  EVT EltType = Ins[i].VT;
1780  Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
1781  if (ExtendIntegerRetVal) {
1782  TheLoadType = MVT::i32;
1783  EltType = MVT::i32;
1784  needTruncate = true;
1785  } else if (TheLoadType.getSizeInBits() < 16) {
1786  if (VTs[i].isInteger())
1787  needTruncate = true;
1788  EltType = MVT::i16;
1789  }
1790 
1791  // Record index of the very first element of the vector.
1792  if (VectorInfo[i] & PVF_FIRST) {
1793  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1794  VecIdx = i;
1795  }
1796 
1797  LoadVTs.push_back(EltType);
1798 
1799  if (VectorInfo[i] & PVF_LAST) {
1800  unsigned NumElts = LoadVTs.size();
1801  LoadVTs.push_back(MVT::Other);
1802  LoadVTs.push_back(MVT::Glue);
 1803  NVPTXISD::NodeType Op;
 1804  switch (NumElts) {
 1805  case 1:
 1806  Op = NVPTXISD::LoadParam;
 1807  break;
 1808  case 2:
 1809  Op = NVPTXISD::LoadParamV2;
 1810  break;
 1811  case 4:
 1812  Op = NVPTXISD::LoadParamV4;
 1813  break;
 1814  default:
 1815  llvm_unreachable("Invalid vector info.");
 1816  }
1817 
1818  SDValue LoadOperands[] = {
1819  Chain, DAG.getConstant(1, dl, MVT::i32),
1820  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1821  SDValue RetVal = DAG.getMemIntrinsicNode(
1822  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
 1823  MachinePointerInfo(), EltAlign,
 1824  MachineMemOperand::MOLoad);
1825 
1826  for (unsigned j = 0; j < NumElts; ++j) {
1827  ProxyRegOps.push_back(RetVal.getValue(j));
1828 
1829  if (needTruncate)
1830  ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1831  else
1832  ProxyRegTruncates.push_back(Optional<MVT>());
1833  }
1834 
1835  Chain = RetVal.getValue(NumElts);
1836  InFlag = RetVal.getValue(NumElts + 1);
1837 
1838  // Cleanup
1839  VecIdx = -1;
1840  LoadVTs.clear();
1841  }
1842  }
1843  }
1844 
1845  Chain = DAG.getCALLSEQ_END(
1846  Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true),
1847  DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl);
1848  InFlag = Chain.getValue(1);
1849 
1850  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1851  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1852  // dangling.
1853  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1854  SDValue Ret = DAG.getNode(
1855  NVPTXISD::ProxyReg, dl,
1856  DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1857  { Chain, ProxyRegOps[i], InFlag }
1858  );
1859 
1860  Chain = Ret.getValue(1);
1861  InFlag = Ret.getValue(2);
1862 
1863  if (ProxyRegTruncates[i].hasValue()) {
1864  Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1865  }
1866 
1867  InVals.push_back(Ret);
1868  }
1869 
1870  // set isTailCall to false for now, until we figure out how to express
1871  // tail call optimization in PTX
1872  isTailCall = false;
1873  return Chain;
1874 }
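// An illustrative sketch of what the node sequence built by LowerCall above
// (DeclareParam/StoreParam, PrintCall, CallVoid, CallArgBegin/CallArg/
// CallArgEnd, LoadParam) ultimately prints as: a PTX ABI call sequence. For a
// direct call of `i32 @f(i32)` it looks roughly like the following; the
// register names and exact .param sizes here are assumptions for
// illustration, not taken from this file:
//
//   .param .b32 param0;
//   st.param.b32  [param0+0], %r1;
//   .param .b32 retval0;
//   call.uni (retval0), f, (param0);
//   ld.param.b32  %r2, [retval0+0];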
1875 
1876 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1877 // (see LegalizeDAG.cpp). This is slow and uses local memory.
 1878 // We use extract/insert/build-vector instead, just as LegalizeOp() did in LLVM 2.5.
1879 SDValue
1880 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1881  SDNode *Node = Op.getNode();
1882  SDLoc dl(Node);
 1883  SmallVector<SDValue, 8> Ops;
 1884  unsigned NumOperands = Node->getNumOperands();
1885  for (unsigned i = 0; i < NumOperands; ++i) {
1886  SDValue SubOp = Node->getOperand(i);
1887  EVT VVT = SubOp.getNode()->getValueType(0);
1888  EVT EltVT = VVT.getVectorElementType();
1889  unsigned NumSubElem = VVT.getVectorNumElements();
1890  for (unsigned j = 0; j < NumSubElem; ++j) {
1891  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1892  DAG.getIntPtrConstant(j, dl)));
1893  }
1894  }
1895  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1896 }
1897 
 1898 // We can initialize a constant f16x2 with a single .b32 move. Normally it
 1899 // would be lowered as two constant loads and a vector-packing move.
1900 // mov.b16 %h1, 0x4000;
1901 // mov.b16 %h2, 0x3C00;
1902 // mov.b32 %hh2, {%h2, %h1};
1903 // Instead we want just a constant move:
1904 // mov.b32 %hh2, 0x40003C00
1905 //
1906 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1907 // generates good SASS in both cases.
1908 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1909  SelectionDAG &DAG) const {
1910  //return Op;
1911  if (!(Op->getValueType(0) == MVT::v2f16 &&
1912  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1913  isa<ConstantFPSDNode>(Op->getOperand(1))))
1914  return Op;
1915 
1916  APInt E0 =
1917  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1918  APInt E1 =
1919  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1920  SDValue Const =
1921  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1922  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1923 }
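// A minimal worked example of the packing above, assuming IEEE-754 half
// encodings: 1.0 is 0x3C00 and 2.0 is 0x4000, so the pair {1.0, 2.0} becomes
// the single .b32 immediate 0x40003C00, with operand 0 in the low half. In
// plain C++ (the helper name is hypothetical, not part of this file):
//
//   #include <cstdint>
//   uint32_t packF16x2(uint16_t E0Bits, uint16_t E1Bits) {
//     // E1 goes to bits 31..16, E0 to bits 15..0, matching E1.shl(16) | E0.
//     return (uint32_t(E1Bits) << 16) | E0Bits;
//   }
//   // packF16x2(0x3C00 /*1.0*/, 0x4000 /*2.0*/) == 0x40003C00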
1924 
1925 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1926  SelectionDAG &DAG) const {
1927  SDValue Index = Op->getOperand(1);
1928  // Constant index will be matched by tablegen.
1929  if (isa<ConstantSDNode>(Index.getNode()))
1930  return Op;
1931 
1932  // Extract individual elements and select one of them.
1933  SDValue Vector = Op->getOperand(0);
1934  EVT VectorVT = Vector.getValueType();
1935  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1936  EVT EltVT = VectorVT.getVectorElementType();
1937 
1938  SDLoc dl(Op.getNode());
1939  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1940  DAG.getIntPtrConstant(0, dl));
1941  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1942  DAG.getIntPtrConstant(1, dl));
 1943  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
 1944  ISD::CondCode::SETEQ);
 1945 }
1946 
1947 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
 1948 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
 1949 /// amount, or
 1950 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
 1951 /// amount.
1952 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1953  SelectionDAG &DAG) const {
1954  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1955  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1956 
1957  EVT VT = Op.getValueType();
1958  unsigned VTBits = VT.getSizeInBits();
1959  SDLoc dl(Op);
1960  SDValue ShOpLo = Op.getOperand(0);
1961  SDValue ShOpHi = Op.getOperand(1);
1962  SDValue ShAmt = Op.getOperand(2);
1963  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1964 
1965  if (VTBits == 32 && STI.getSmVersion() >= 35) {
 1966  // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
1967  // {dHi, dLo} = {aHi, aLo} >> Amt
1968  // dHi = aHi >> Amt
1969  // dLo = shf.r.clamp aLo, aHi, Amt
1970 
1971  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1972  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1973  ShAmt);
1974 
1975  SDValue Ops[2] = { Lo, Hi };
1976  return DAG.getMergeValues(Ops, dl);
1977  }
1978  else {
1979  // {dHi, dLo} = {aHi, aLo} >> Amt
1980  // - if (Amt>=size) then
1981  // dLo = aHi >> (Amt-size)
1982  // dHi = aHi >> Amt (this is either all 0 or all 1)
1983  // else
1984  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1985  // dHi = aHi >> Amt
1986 
1987  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1988  DAG.getConstant(VTBits, dl, MVT::i32),
1989  ShAmt);
1990  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1991  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1992  DAG.getConstant(VTBits, dl, MVT::i32));
1993  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1994  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1995  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1996 
1997  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1998  DAG.getConstant(VTBits, dl, MVT::i32),
1999  ISD::SETGE);
2000  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2001  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2002 
2003  SDValue Ops[2] = { Lo, Hi };
2004  return DAG.getMergeValues(Ops, dl);
2005  }
2006 }
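// A scalar C++ model of the generic (pre-sm_35) expansion above, assuming a
// 64-bit value split into two 32-bit halves and 0 <= Amt < 64. The helper
// name is hypothetical; SRA_PARTS differs only in that dHi uses an arithmetic
// shift and the Amt >= 32 branch fills dHi with the sign bits:
//
//   #include <cstdint>
//   void srl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
//                   uint32_t &DLo, uint32_t &DHi) {
//     if (Amt == 0) { DLo = Lo; DHi = Hi; return; } // avoid UB on 32-bit shifts
//     if (Amt >= 32) {
//       DLo = Hi >> (Amt - 32);   // only bits of Hi survive
//       DHi = 0;                  // Hi >> Amt is all zero for a logical shift
//     } else {
//       DLo = (Lo >> Amt) | (Hi << (32 - Amt)); // funnel bits from both halves
//       DHi = Hi >> Amt;
//     }
//   }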
2007 
2008 /// LowerShiftLeftParts - Lower SHL_PARTS, which
 2009 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
 2010 /// amount, or
 2011 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
 2012 /// amount.
2013 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2014  SelectionDAG &DAG) const {
2015  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2016  assert(Op.getOpcode() == ISD::SHL_PARTS);
2017 
2018  EVT VT = Op.getValueType();
2019  unsigned VTBits = VT.getSizeInBits();
2020  SDLoc dl(Op);
2021  SDValue ShOpLo = Op.getOperand(0);
2022  SDValue ShOpHi = Op.getOperand(1);
2023  SDValue ShAmt = Op.getOperand(2);
2024 
2025  if (VTBits == 32 && STI.getSmVersion() >= 35) {
 2026  // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
2027  // {dHi, dLo} = {aHi, aLo} << Amt
2028  // dHi = shf.l.clamp aLo, aHi, Amt
2029  // dLo = aLo << Amt
2030 
2031  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2032  ShAmt);
2033  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2034 
2035  SDValue Ops[2] = { Lo, Hi };
2036  return DAG.getMergeValues(Ops, dl);
2037  }
2038  else {
2039  // {dHi, dLo} = {aHi, aLo} << Amt
2040  // - if (Amt>=size) then
 2041  // dLo = aLo << Amt (all 0)
 2042  // dHi = aLo << (Amt-size)
2043  // else
2044  // dLo = aLo << Amt
2045  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2046 
2047  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2048  DAG.getConstant(VTBits, dl, MVT::i32),
2049  ShAmt);
2050  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2051  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2052  DAG.getConstant(VTBits, dl, MVT::i32));
2053  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2054  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2055  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2056 
2057  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2058  DAG.getConstant(VTBits, dl, MVT::i32),
2059  ISD::SETGE);
2060  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2061  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2062 
2063  SDValue Ops[2] = { Lo, Hi };
2064  return DAG.getMergeValues(Ops, dl);
2065  }
2066 }
2067 
2068 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2069  EVT VT = Op.getValueType();
2070 
2071  if (VT == MVT::f32)
2072  return LowerFROUND32(Op, DAG);
2073 
2074  if (VT == MVT::f64)
2075  return LowerFROUND64(Op, DAG);
2076 
2077  llvm_unreachable("unhandled type");
2078 }
2079 
 2080 // This is the rounding method used in CUDA libdevice, expressed in C-like code:
2081 // float roundf(float A)
2082 // {
2083 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2084 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2085 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2086 // }
2087 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2088  SelectionDAG &DAG) const {
2089  SDLoc SL(Op);
2090  SDValue A = Op.getOperand(0);
2091  EVT VT = Op.getValueType();
2092 
2093  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2094 
2095  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2096  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2097  const int SignBitMask = 0x80000000;
2098  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2099  DAG.getConstant(SignBitMask, SL, MVT::i32));
2100  const int PointFiveInBits = 0x3F000000;
2101  SDValue PointFiveWithSignRaw =
2102  DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2103  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2104  SDValue PointFiveWithSign =
2105  DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2106  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2107  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2108 
2109  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2110  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2111  SDValue IsLarge =
2112  DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2113  ISD::SETOGT);
2114  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2115 
2116  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2117  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2118  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2119  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2120  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2121 }
2122 
2123 // The implementation of round(double) is similar to that of round(float) in
2124 // that they both separate the value range into three regions and use a method
2125 // specific to the region to round the values. However, round(double) first
2126 // calculates the round of the absolute value and then adds the sign back while
2127 // round(float) directly rounds the value with sign.
2128 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2129  SelectionDAG &DAG) const {
2130  SDLoc SL(Op);
2131  SDValue A = Op.getOperand(0);
2132  EVT VT = Op.getValueType();
2133 
2134  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2135 
2136  // double RoundedA = (double) (int) (abs(A) + 0.5f);
2137  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2138  DAG.getConstantFP(0.5, SL, VT));
2139  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2140 
2141  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2142  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2143  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2144  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2145  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2146  DAG.getConstantFP(0, SL, VT),
2147  RoundedA);
2148 
2149  // Add sign to rounded_A
2150  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2151  DAG.getNode(ISD::FTRUNC, SL, VT, A);
2152 
2153  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2154  SDValue IsLarge =
2155  DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2156  ISD::SETOGT);
2157  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2158 }
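// A C-like model of the sequence built above, mirroring the comment before
// LowerFROUND64: round |A|, clamp the |A| < 0.5 case to zero, restore the
// sign, and fall back to A itself once the magnitude is too large to carry a
// fractional part. The helper name is hypothetical, not part of this file:
//
//   #include <cmath>
//   double roundSketch(double A) {
//     double AbsA     = std::fabs(A);
//     double RoundedA = std::trunc(AbsA + 0.5);      // (double)(int)(|A| + 0.5)
//     RoundedA = AbsA < 0.5 ? 0.0 : RoundedA;        // |A| < 0.5 rounds to +/-0
//     RoundedA = std::copysign(RoundedA, A);         // add the sign of A back
//     return AbsA > 0x1.0p52 ? A : RoundedA;         // >= 2^52: already integral
//   }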
2159 
2160 
2161 
2162 SDValue
 2163 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 2164  switch (Op.getOpcode()) {
2165  case ISD::RETURNADDR:
2166  return SDValue();
2167  case ISD::FRAMEADDR:
2168  return SDValue();
2169  case ISD::GlobalAddress:
2170  return LowerGlobalAddress(Op, DAG);
 2171  case ISD::INTRINSIC_W_CHAIN:
 2172  return Op;
 2173  case ISD::BUILD_VECTOR:
 2174  return LowerBUILD_VECTOR(Op, DAG);
 2175  case ISD::EXTRACT_SUBVECTOR:
 2176  return Op;
 2177  case ISD::EXTRACT_VECTOR_ELT:
 2178  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2179  case ISD::CONCAT_VECTORS:
2180  return LowerCONCAT_VECTORS(Op, DAG);
2181  case ISD::STORE:
2182  return LowerSTORE(Op, DAG);
2183  case ISD::LOAD:
2184  return LowerLOAD(Op, DAG);
2185  case ISD::SHL_PARTS:
2186  return LowerShiftLeftParts(Op, DAG);
2187  case ISD::SRA_PARTS:
2188  case ISD::SRL_PARTS:
2189  return LowerShiftRightParts(Op, DAG);
2190  case ISD::SELECT:
2191  return LowerSelect(Op, DAG);
2192  case ISD::FROUND:
2193  return LowerFROUND(Op, DAG);
2194  default:
2195  llvm_unreachable("Custom lowering not defined for operation");
2196  }
2197 }
2198 
2199 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2200  SDValue Op0 = Op->getOperand(0);
2201  SDValue Op1 = Op->getOperand(1);
2202  SDValue Op2 = Op->getOperand(2);
2203  SDLoc DL(Op.getNode());
2204 
2205  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2206 
2207  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2208  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2209  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2210  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2211 
2212  return Trunc;
2213 }
2214 
2215 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2216  if (Op.getValueType() == MVT::i1)
2217  return LowerLOADi1(Op, DAG);
2218 
2219  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2220  // loads and have to handle it here.
2221  if (Op.getValueType() == MVT::v2f16) {
2222  LoadSDNode *Load = cast<LoadSDNode>(Op);
2223  EVT MemVT = Load->getMemoryVT();
 2224  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
 2225  MemVT, *Load->getMemOperand())) {
2226  SDValue Ops[2];
2227  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2228  return DAG.getMergeValues(Ops, SDLoc(Op));
2229  }
2230  }
2231 
2232  return SDValue();
2233 }
2234 
2235 // v = ld i1* addr
2236 // =>
2237 // v1 = ld i8* addr (-> i16)
2238 // v = trunc i16 to i1
2239 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2240  SDNode *Node = Op.getNode();
2241  LoadSDNode *LD = cast<LoadSDNode>(Node);
2242  SDLoc dl(Node);
2243  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2244  assert(Node->getValueType(0) == MVT::i1 &&
2245  "Custom lowering for i1 load only");
2246  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2247  LD->getPointerInfo(), LD->getAlignment(),
2248  LD->getMemOperand()->getFlags());
2249  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2250  // The legalizer (the caller) is expecting two values from the legalized
2251  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2252  // in LegalizeDAG.cpp which also uses MergeValues.
2253  SDValue Ops[] = { result, LD->getChain() };
2254  return DAG.getMergeValues(Ops, dl);
2255 }
2256 
2257 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2258  StoreSDNode *Store = cast<StoreSDNode>(Op);
2259  EVT VT = Store->getMemoryVT();
2260 
2261  if (VT == MVT::i1)
2262  return LowerSTOREi1(Op, DAG);
2263 
2264  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2265  // stores and have to handle it here.
2266  if (VT == MVT::v2f16 &&
 2267  !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
 2268  VT, *Store->getMemOperand()))
2269  return expandUnalignedStore(Store, DAG);
2270 
2271  if (VT.isVector())
2272  return LowerSTOREVector(Op, DAG);
2273 
2274  return SDValue();
2275 }
2276 
2277 SDValue
2278 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2279  SDNode *N = Op.getNode();
2280  SDValue Val = N->getOperand(1);
2281  SDLoc DL(N);
2282  EVT ValVT = Val.getValueType();
2283 
2284  if (ValVT.isVector()) {
2285  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2286  // legal. We can (and should) split that into 2 stores of <2 x double> here
2287  // but I'm leaving that as a TODO for now.
2288  if (!ValVT.isSimple())
2289  return SDValue();
2290  switch (ValVT.getSimpleVT().SimpleTy) {
2291  default:
2292  return SDValue();
2293  case MVT::v2i8:
2294  case MVT::v2i16:
2295  case MVT::v2i32:
2296  case MVT::v2i64:
2297  case MVT::v2f16:
2298  case MVT::v2f32:
2299  case MVT::v2f64:
2300  case MVT::v4i8:
2301  case MVT::v4i16:
2302  case MVT::v4i32:
2303  case MVT::v4f16:
2304  case MVT::v4f32:
2305  case MVT::v8f16: // <4 x f16x2>
2306  // This is a "native" vector type
2307  break;
2308  }
2309 
2310  MemSDNode *MemSD = cast<MemSDNode>(N);
2311  const DataLayout &TD = DAG.getDataLayout();
2312 
2313  Align Alignment = MemSD->getAlign();
2314  Align PrefAlign =
2315  TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2316  if (Alignment < PrefAlign) {
2317  // This store is not sufficiently aligned, so bail out and let this vector
2318  // store be scalarized. Note that we may still be able to emit smaller
2319  // vector stores. For example, if we are storing a <4 x float> with an
2320  // alignment of 8, this check will fail but the legalizer will try again
2321  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2322  return SDValue();
2323  }
2324 
2325  unsigned Opcode = 0;
2326  EVT EltVT = ValVT.getVectorElementType();
2327  unsigned NumElts = ValVT.getVectorNumElements();
2328 
2329  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2330  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2331  // stored type to i16 and propagate the "real" type as the memory type.
2332  bool NeedExt = false;
2333  if (EltVT.getSizeInBits() < 16)
2334  NeedExt = true;
2335 
2336  bool StoreF16x2 = false;
2337  switch (NumElts) {
2338  default:
2339  return SDValue();
2340  case 2:
2341  Opcode = NVPTXISD::StoreV2;
2342  break;
2343  case 4:
2344  Opcode = NVPTXISD::StoreV4;
2345  break;
2346  case 8:
2347  // v8f16 is a special case. PTX doesn't have st.v8.f16
2348  // instruction. Instead, we split the vector into v2f16 chunks and
2349  // store them with st.v4.b32.
2350  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2351  Opcode = NVPTXISD::StoreV4;
2352  StoreF16x2 = true;
2353  break;
2354  }
2355 
 2356  SmallVector<SDValue, 8> Ops;
 2357 
2358  // First is the chain
2359  Ops.push_back(N->getOperand(0));
2360 
2361  if (StoreF16x2) {
2362  // Combine f16,f16 -> v2f16
2363  NumElts /= 2;
2364  for (unsigned i = 0; i < NumElts; ++i) {
 2365  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
 2366  DAG.getIntPtrConstant(i * 2, DL));
 2367  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
 2368  DAG.getIntPtrConstant(i * 2 + 1, DL));
2369  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2370  Ops.push_back(V2);
2371  }
2372  } else {
2373  // Then the split values
2374  for (unsigned i = 0; i < NumElts; ++i) {
2375  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2376  DAG.getIntPtrConstant(i, DL));
2377  if (NeedExt)
2378  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2379  Ops.push_back(ExtVal);
2380  }
2381  }
2382 
2383  // Then any remaining arguments
2384  Ops.append(N->op_begin() + 2, N->op_end());
2385 
2386  SDValue NewSt =
2387  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2388  MemSD->getMemoryVT(), MemSD->getMemOperand());
2389 
2390  // return DCI.CombineTo(N, NewSt, true);
2391  return NewSt;
2392  }
2393 
2394  return SDValue();
2395 }
2396 
2397 // st i1 v, addr
2398 // =>
2399 // v1 = zxt v to i16
2400 // st.u8 i16, addr
2401 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2402  SDNode *Node = Op.getNode();
2403  SDLoc dl(Node);
2404  StoreSDNode *ST = cast<StoreSDNode>(Node);
2405  SDValue Tmp1 = ST->getChain();
2406  SDValue Tmp2 = ST->getBasePtr();
2407  SDValue Tmp3 = ST->getValue();
2408  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2409  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2410  SDValue Result =
2411  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2412  ST->getAlignment(), ST->getMemOperand()->getFlags());
2413  return Result;
2414 }
2415 
2416 SDValue
2417 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2418  std::string ParamSym;
2419  raw_string_ostream ParamStr(ParamSym);
2420 
2421  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2422  ParamStr.flush();
2423 
2424  std::string *SavedStr =
2425  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2426  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2427 }
2428 
 2429 SDValue NVPTXTargetLowering::LowerFormalArguments(
 2430  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2431  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2432  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2433  MachineFunction &MF = DAG.getMachineFunction();
2434  const DataLayout &DL = DAG.getDataLayout();
2435  auto PtrVT = getPointerTy(DAG.getDataLayout());
2436 
2437  const Function *F = &MF.getFunction();
2438  const AttributeList &PAL = F->getAttributes();
2439  const TargetLowering *TLI = STI.getTargetLowering();
2440 
2441  SDValue Root = DAG.getRoot();
2442  std::vector<SDValue> OutChains;
2443 
2444  bool isABI = (STI.getSmVersion() >= 20);
2445  assert(isABI && "Non-ABI compilation is not supported");
2446  if (!isABI)
2447  return Chain;
2448 
2449  std::vector<Type *> argTypes;
2450  std::vector<const Argument *> theArgs;
2451  for (const Argument &I : F->args()) {
2452  theArgs.push_back(&I);
2453  argTypes.push_back(I.getType());
2454  }
2455  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2456  // Ins.size() will be larger
2457  // * if there is an aggregate argument with multiple fields (each field
2458  // showing up separately in Ins)
2459  // * if there is a vector argument with more than typical vector-length
2460  // elements (generally if more than 4) where each vector element is
2461  // individually present in Ins.
2462  // So a different index should be used for indexing into Ins.
2463  // See similar issue in LowerCall.
2464  unsigned InsIdx = 0;
2465 
2466  int idx = 0;
2467  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2468  Type *Ty = argTypes[i];
2469 
2470  if (theArgs[i]->use_empty()) {
2471  // argument is dead
2472  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2473  SmallVector<EVT, 16> vtparts;
2474 
2475  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2476  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2477  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2478  ++parti) {
2479  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2480  ++InsIdx;
2481  }
2482  if (vtparts.size() > 0)
2483  --InsIdx;
2484  continue;
2485  }
2486  if (Ty->isVectorTy()) {
2487  EVT ObjectVT = getValueType(DL, Ty);
2488  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2489  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2490  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2491  ++InsIdx;
2492  }
2493  if (NumRegs > 0)
2494  --InsIdx;
2495  continue;
2496  }
2497  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2498  continue;
2499  }
2500 
2501  // In the following cases, assign a node order of "idx+1"
2502  // to newly created nodes. The SDNodes for params have to
2503  // appear in the same order as their order of appearance
2504  // in the original function. "idx+1" holds that order.
2505  if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
2506  bool aggregateIsPacked = false;
2507  if (StructType *STy = dyn_cast<StructType>(Ty))
2508  aggregateIsPacked = STy->isPacked();
2509 
 2510  SmallVector<EVT, 16> VTs;
 2511  SmallVector<uint64_t, 16> Offsets;
 2512  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2513  assert(VTs.size() > 0 && "Unexpected empty type.");
2514  auto VectorInfo =
2515  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
2516 
2517  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2518  int VecIdx = -1; // Index of the first element of the current vector.
2519  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2520  if (VectorInfo[parti] & PVF_FIRST) {
2521  assert(VecIdx == -1 && "Orphaned vector.");
2522  VecIdx = parti;
2523  }
2524 
2525  // That's the last element of this store op.
2526  if (VectorInfo[parti] & PVF_LAST) {
2527  unsigned NumElts = parti - VecIdx + 1;
2528  EVT EltVT = VTs[parti];
2529  // i1 is loaded/stored as i8.
2530  EVT LoadVT = EltVT;
2531  if (EltVT == MVT::i1)
2532  LoadVT = MVT::i8;
2533  else if (EltVT == MVT::v2f16)
2534  // getLoad needs a vector type, but it can't handle
2535  // vectors which contain v2f16 elements. So we must load
2536  // using i32 here and then bitcast back.
2537  LoadVT = MVT::i32;
2538 
2539  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2540  SDValue VecAddr =
2541  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2542  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
 2543  Value *srcValue = Constant::getNullValue(PointerType::get(
 2544  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2545  SDValue P =
2546  DAG.getLoad(VecVT, dl, Root, VecAddr,
2547  MachinePointerInfo(srcValue), aggregateIsPacked,
 2548  MachineMemOperand::MODereferenceable |
 2549  MachineMemOperand::MOInvariant);
 2550  if (P.getNode())
2551  P.getNode()->setIROrder(idx + 1);
2552  for (unsigned j = 0; j < NumElts; ++j) {
2553  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2554  DAG.getIntPtrConstant(j, dl));
2555  // We've loaded i1 as an i8 and now must truncate it back to i1
2556  if (EltVT == MVT::i1)
2557  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2558  // v2f16 was loaded as an i32. Now we must bitcast it back.
2559  else if (EltVT == MVT::v2f16)
2560  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2561  // Extend the element if necessary (e.g. an i8 is loaded
2562  // into an i16 register)
2563  if (Ins[InsIdx].VT.isInteger() &&
2564  Ins[InsIdx].VT.getFixedSizeInBits() >
2565  LoadVT.getFixedSizeInBits()) {
2566  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2567  : ISD::ZERO_EXTEND;
2568  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2569  }
2570  InVals.push_back(Elt);
2571  }
2572 
2573  // Reset vector tracking state.
2574  VecIdx = -1;
2575  }
2576  ++InsIdx;
2577  }
2578  if (VTs.size() > 0)
2579  --InsIdx;
2580  continue;
2581  }
2582 
2583  // Param has ByVal attribute
2584  // Return MoveParam(param symbol).
 2585  // Ideally, the param symbol could be returned directly,
 2586  // but when the SDNode builder decides to use it in a CopyToReg(),
 2587  // the machine instruction fails because a TargetExternalSymbol
 2588  // (not lowered) is target dependent, and CopyToReg assumes
 2589  // the source has been lowered.
2590  EVT ObjectVT = getValueType(DL, Ty);
2591  assert(ObjectVT == Ins[InsIdx].VT &&
2592  "Ins type did not match function type");
2593  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2594  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2595  if (p.getNode())
2596  p.getNode()->setIROrder(idx + 1);
2597  InVals.push_back(p);
2598  }
2599 
 2600  // Clang will check for explicit varargs and issue an error if any are
 2601  // present. However, Clang will let code with an implicit vararg
 2602  // declaration, like f(), pass. See bug 617733.
2603  // We treat this case as if the arg list is empty.
2604  // if (F.isVarArg()) {
2605  // assert(0 && "VarArg not supported yet!");
2606  //}
2607 
2608  if (!OutChains.empty())
2609  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2610 
2611  return Chain;
2612 }
2613 
2614 SDValue
 2615 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 2616  bool isVarArg,
2617  const SmallVectorImpl<ISD::OutputArg> &Outs,
2618  const SmallVectorImpl<SDValue> &OutVals,
2619  const SDLoc &dl, SelectionDAG &DAG) const {
2620  const MachineFunction &MF = DAG.getMachineFunction();
2621  const Function &F = MF.getFunction();
2622  Type *RetTy = MF.getFunction().getReturnType();
2623 
2624  bool isABI = (STI.getSmVersion() >= 20);
2625  assert(isABI && "Non-ABI compilation is not supported");
2626  if (!isABI)
2627  return Chain;
2628 
2629  const DataLayout &DL = DAG.getDataLayout();
 2630  SmallVector<EVT, 16> VTs;
 2631  SmallVector<uint64_t, 16> Offsets;
 2632  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2633  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2634 
2635  auto VectorInfo = VectorizePTXValueVTs(
2636  VTs, Offsets,
2637  RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
2638  : Align(1));
2639 
2640  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2641  // 32-bits are sign extended or zero extended, depending on whether
2642  // they are signed or unsigned types.
2643  bool ExtendIntegerRetVal =
2644  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2645 
2646  SmallVector<SDValue, 6> StoreOperands;
2647  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2648  // New load/store. Record chain and offset operands.
2649  if (VectorInfo[i] & PVF_FIRST) {
2650  assert(StoreOperands.empty() && "Orphaned operand list.");
2651  StoreOperands.push_back(Chain);
2652  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2653  }
2654 
2655  SDValue RetVal = OutVals[i];
2656  if (ExtendIntegerRetVal) {
2657  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2658  : ISD::ZERO_EXTEND,
2659  dl, MVT::i32, RetVal);
2660  } else if (RetVal.getValueSizeInBits() < 16) {
2661  // Use 16-bit registers for small load-stores as it's the
2662  // smallest general purpose register size supported by NVPTX.
2663  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2664  }
2665 
2666  // Record the value to return.
2667  StoreOperands.push_back(RetVal);
2668 
2669  // That's the last element of this store op.
2670  if (VectorInfo[i] & PVF_LAST) {
 2671  NVPTXISD::NodeType Op;
 2672  unsigned NumElts = StoreOperands.size() - 2;
 2673  switch (NumElts) {
 2674  case 1:
 2675  Op = NVPTXISD::StoreRetval;
 2676  break;
 2677  case 2:
 2678  Op = NVPTXISD::StoreRetvalV2;
 2679  break;
 2680  case 4:
 2681  Op = NVPTXISD::StoreRetvalV4;
 2682  break;
 2683  default:
 2684  llvm_unreachable("Invalid vector info.");
 2685  }
2686 
2687  // Adjust type of load/store op if we've extended the scalar
2688  // return value.
2689  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2690  Chain = DAG.getMemIntrinsicNode(
 2691  Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
 2692  MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
2693  // Cleanup vector state.
2694  StoreOperands.clear();
2695  }
2696  }
2697 
2698  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2699 }
2700 
 2701 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
 2702  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2703  SelectionDAG &DAG) const {
2704  if (Constraint.length() > 1)
2705  return;
2706  else
2707  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2708 }
2709 
2710 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2711  switch (Intrinsic) {
2712  default:
2713  return 0;
2714 
2715  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2716  return NVPTXISD::Tex1DFloatS32;
2717  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2719  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2721  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2723  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2724  return NVPTXISD::Tex1DS32S32;
2725  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2726  return NVPTXISD::Tex1DS32Float;
2727  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2729  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2731  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2732  return NVPTXISD::Tex1DU32S32;
2733  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2734  return NVPTXISD::Tex1DU32Float;
2735  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2737  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2739 
2740  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2742  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2744  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2746  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2748  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2750  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2752  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2754  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2756  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2758  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2760  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2762  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2764 
2765  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2766  return NVPTXISD::Tex2DFloatS32;
2767  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2769  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2771  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2773  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2774  return NVPTXISD::Tex2DS32S32;
2775  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2776  return NVPTXISD::Tex2DS32Float;
2777  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2779  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2781  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2782  return NVPTXISD::Tex2DU32S32;
2783  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2784  return NVPTXISD::Tex2DU32Float;
2785  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2787  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2789 
2790  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2792  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2794  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2796  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2798  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2800  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2802  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2804  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2806  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2808  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2810  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2812  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2814 
2815  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2816  return NVPTXISD::Tex3DFloatS32;
2817  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2819  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2821  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2823  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2824  return NVPTXISD::Tex3DS32S32;
2825  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2826  return NVPTXISD::Tex3DS32Float;
2827  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2829  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2831  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2832  return NVPTXISD::Tex3DU32S32;
2833  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2834  return NVPTXISD::Tex3DU32Float;
2835  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2837  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2839 
2840  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2842  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2844  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2846  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2848  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2850  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2852 
2853  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2855  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2857  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2859  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2861  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2863  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2865 
2866  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2868  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2870  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2872  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2874  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2876  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2878  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2880  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2882  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2884  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2886  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2888  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2890 
2891  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2893  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2895  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2897  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2899  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2901  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2903  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2905  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2907  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2909  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2911  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2913  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2915 
2916  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2918  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2920  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2922  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2924  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2926  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2928  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2930  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2932  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2934  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2936  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2938  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2940 
2941  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2943  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2945  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2947  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2949  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2951  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2953  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2955  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2957  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2959  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2961  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2963  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2965 
2966  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2968  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2970  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2972  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2974  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2976  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2978  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2980  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2982  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2984  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2986  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2988  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2990 
2991  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2993  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2995  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2997  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2999  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3001  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3003  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3005  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3007  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3009  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3011  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3013  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3015 
3016  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3018  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3020  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3022  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3024  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3026  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3028 
3029  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3031  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3033  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3035  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3037  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3039  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3041 
3042  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3044  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3046  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3048  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3050  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3052  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3054  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3056  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3058  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3060  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3062  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3064  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3066  }
3067 }
3068 
3069 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3070  switch (Intrinsic) {
3071  default:
3072  return 0;
3073  case Intrinsic::nvvm_suld_1d_i8_clamp:
3074  return NVPTXISD::Suld1DI8Clamp;
3075  case Intrinsic::nvvm_suld_1d_i16_clamp:
3076  return NVPTXISD::Suld1DI16Clamp;
3077  case Intrinsic::nvvm_suld_1d_i32_clamp:
3078  return NVPTXISD::Suld1DI32Clamp;
3079  case Intrinsic::nvvm_suld_1d_i64_clamp:
3080  return NVPTXISD::Suld1DI64Clamp;
3081  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3083  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3085  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3087  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3089  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3091  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3093  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3095  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3097  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3099  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3101  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3103  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3105  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3107  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3109  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3111  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3113  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3115  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3117  case Intrinsic::nvvm_suld_2d_i8_clamp:
3118  return NVPTXISD::Suld2DI8Clamp;
3119  case Intrinsic::nvvm_suld_2d_i16_clamp:
3120  return NVPTXISD::Suld2DI16Clamp;
3121  case Intrinsic::nvvm_suld_2d_i32_clamp:
3122  return NVPTXISD::Suld2DI32Clamp;
3123  case Intrinsic::nvvm_suld_2d_i64_clamp:
3124  return NVPTXISD::Suld2DI64Clamp;
3125  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3127  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3129  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3131  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3133  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3135  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3137  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3139  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3141  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3143  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3145  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3147  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3149  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3151  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3153  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3155  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3157  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3159  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3161  case Intrinsic::nvvm_suld_3d_i8_clamp:
3162  return NVPTXISD::Suld3DI8Clamp;
3163  case Intrinsic::nvvm_suld_3d_i16_clamp:
3164  return NVPTXISD::Suld3DI16Clamp;
3165  case Intrinsic::nvvm_suld_3d_i32_clamp:
3166  return NVPTXISD::Suld3DI32Clamp;
3167  case Intrinsic::nvvm_suld_3d_i64_clamp:
3168  return NVPTXISD::Suld3DI64Clamp;
3169  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3171  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3173  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3175  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3177  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3179  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3181  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3183  case Intrinsic::nvvm_suld_1d_i8_trap:
3184  return NVPTXISD::Suld1DI8Trap;
3185  case Intrinsic::nvvm_suld_1d_i16_trap:
3186  return NVPTXISD::Suld1DI16Trap;
3187  case Intrinsic::nvvm_suld_1d_i32_trap:
3188  return NVPTXISD::Suld1DI32Trap;
3189  case Intrinsic::nvvm_suld_1d_i64_trap:
3190  return NVPTXISD::Suld1DI64Trap;
3191  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3192  return NVPTXISD::Suld1DV2I8Trap;
3193  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3195  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3197  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3199  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3200  return NVPTXISD::Suld1DV4I8Trap;
3201  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3203  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3205  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3207  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3209  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3211  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3213  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3215  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3217  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3219  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3221  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3223  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3225  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3227  case Intrinsic::nvvm_suld_2d_i8_trap:
3228  return NVPTXISD::Suld2DI8Trap;
3229  case Intrinsic::nvvm_suld_2d_i16_trap:
3230  return NVPTXISD::Suld2DI16Trap;
3231  case Intrinsic::nvvm_suld_2d_i32_trap:
3232  return NVPTXISD::Suld2DI32Trap;
3233  case Intrinsic::nvvm_suld_2d_i64_trap:
3234  return NVPTXISD::Suld2DI64Trap;
3235  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3236  return NVPTXISD::Suld2DV2I8Trap;
3237  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3239  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3241  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3243  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3244  return NVPTXISD::Suld2DV4I8Trap;
3245  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3247  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3249  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3251  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3253  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3255  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3257  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3259  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3261  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3263  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3265  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3267  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3269  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3271  case Intrinsic::nvvm_suld_3d_i8_trap:
3272  return NVPTXISD::Suld3DI8Trap;
3273  case Intrinsic::nvvm_suld_3d_i16_trap:
3274  return NVPTXISD::Suld3DI16Trap;
3275  case Intrinsic::nvvm_suld_3d_i32_trap:
3276  return NVPTXISD::Suld3DI32Trap;
3277  case Intrinsic::nvvm_suld_3d_i64_trap:
3278  return NVPTXISD::Suld3DI64Trap;
3279  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3280  return NVPTXISD::Suld3DV2I8Trap;
3281  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3283  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3285  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3287  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3288  return NVPTXISD::Suld3DV4I8Trap;
3289  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3291  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3293  case Intrinsic::nvvm_suld_1d_i8_zero:
3294  return NVPTXISD::Suld1DI8Zero;
3295  case Intrinsic::nvvm_suld_1d_i16_zero:
3296  return NVPTXISD::Suld1DI16Zero;
3297  case Intrinsic::nvvm_suld_1d_i32_zero:
3298  return NVPTXISD::Suld1DI32Zero;
3299  case Intrinsic::nvvm_suld_1d_i64_zero:
3300  return NVPTXISD::Suld1DI64Zero;
3301  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3302  return NVPTXISD::Suld1DV2I8Zero;
3303  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3305  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3307  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3309  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3310  return NVPTXISD::Suld1DV4I8Zero;
3311  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3313  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3315  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3317  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3319  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3321  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3323  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3325  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3327  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3329  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3331  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3333  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3335  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3337  case Intrinsic::nvvm_suld_2d_i8_zero:
3338  return NVPTXISD::Suld2DI8Zero;
3339  case Intrinsic::nvvm_suld_2d_i16_zero:
3340  return NVPTXISD::Suld2DI16Zero;
3341  case Intrinsic::nvvm_suld_2d_i32_zero:
3342  return NVPTXISD::Suld2DI32Zero;
3343  case Intrinsic::nvvm_suld_2d_i64_zero:
3344  return NVPTXISD::Suld2DI64Zero;
3345  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3346  return NVPTXISD::Suld2DV2I8Zero;
3347  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3349  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3351  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3353  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3354  return NVPTXISD::Suld2DV4I8Zero;
3355  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3356  return NVPTXISD::Suld2DV4I16Zero;
3357  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3358  return NVPTXISD::Suld2DV4I32Zero;
3359  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3360  return NVPTXISD::Suld2DArrayI8Zero;
3361  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3362  return NVPTXISD::Suld2DArrayI16Zero;
3363  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3364  return NVPTXISD::Suld2DArrayI32Zero;
3365  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3366  return NVPTXISD::Suld2DArrayI64Zero;
3367  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3368  return NVPTXISD::Suld2DArrayV2I8Zero;
3369  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3370  return NVPTXISD::Suld2DArrayV2I16Zero;
3371  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3372  return NVPTXISD::Suld2DArrayV2I32Zero;
3373  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3374  return NVPTXISD::Suld2DArrayV2I64Zero;
3375  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3376  return NVPTXISD::Suld2DArrayV4I8Zero;
3377  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3378  return NVPTXISD::Suld2DArrayV4I16Zero;
3379  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3380  return NVPTXISD::Suld2DArrayV4I32Zero;
3381  case Intrinsic::nvvm_suld_3d_i8_zero:
3382  return NVPTXISD::Suld3DI8Zero;
3383  case Intrinsic::nvvm_suld_3d_i16_zero:
3384  return NVPTXISD::Suld3DI16Zero;
3385  case Intrinsic::nvvm_suld_3d_i32_zero:
3386  return NVPTXISD::Suld3DI32Zero;
3387  case Intrinsic::nvvm_suld_3d_i64_zero:
3388  return NVPTXISD::Suld3DI64Zero;
3389  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3390  return NVPTXISD::Suld3DV2I8Zero;
3391  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3392  return NVPTXISD::Suld3DV2I16Zero;
3393  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3394  return NVPTXISD::Suld3DV2I32Zero;
3395  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3396  return NVPTXISD::Suld3DV2I64Zero;
3397  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3398  return NVPTXISD::Suld3DV4I8Zero;
3399  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3400  return NVPTXISD::Suld3DV4I16Zero;
3401  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3402  return NVPTXISD::Suld3DV4I32Zero;
3403  }
3404 }
3405 
3406 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3407 // TgtMemIntrinsic
3408 // because we need the information that is only available in the "Value" type
3409 // of destination
3410 // pointer. In particular, the address space information.
3411 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3412  IntrinsicInfo &Info, const CallInst &I,
3413  MachineFunction &MF, unsigned Intrinsic) const {
3414  switch (Intrinsic) {
3415  default:
3416  return false;
3417  case Intrinsic::nvvm_match_all_sync_i32p:
3418  case Intrinsic::nvvm_match_all_sync_i64p:
3419  Info.opc = ISD::INTRINSIC_W_CHAIN;
3420  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3421  // in order to model data exchange with other threads, but perform no real
3422  // memory accesses.
3423  Info.memVT = MVT::i1;
3424 
3425  // Our result depends on both our own and other threads' arguments.
3426  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3427  return true;
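// Editorial example (a sketch, not from the original source): the
// nvvm_match_all_sync_* intrinsics are what the CUDA __match_all_sync()
// builtin lowers to, e.g.
//
//   int pred;
//   unsigned eq = __match_all_sync(0xffffffff, value, &pred);
//
// They exchange values between lanes rather than touching memory, so the
// bogus MVT::i1 memVT with MOLoad|MOStore above exists only to keep the node
// ordered relative to other cross-thread operations.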
3428  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3429  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3430  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3431  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3432  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3433  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3434  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3435  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3436  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3437  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3438  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3439  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3440  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3441  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3442  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3443  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3444  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3445  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3446  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3447  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3448  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3449  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3450  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3451  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3452  Info.opc = ISD::INTRINSIC_W_CHAIN;
3453  Info.memVT = MVT::v8f16;
3454  Info.ptrVal = I.getArgOperand(0);
3455  Info.offset = 0;
3456  Info.flags = MachineMemOperand::MOLoad;
3457  Info.align = Align(16);
3458  return true;
3459  }
3460  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3461  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3462  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3463  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3464  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3465  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3466  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3467  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3468  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3469  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3470  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3471  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3472  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3473  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3474  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3475  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3476  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3477  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3478  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3479  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3480  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3481  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3482  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3483  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3484  Info.opc = ISD::INTRINSIC_W_CHAIN;
3485  Info.memVT = MVT::v2i32;
3486  Info.ptrVal = I.getArgOperand(0);
3487  Info.offset = 0;
3488  Info.flags = MachineMemOperand::MOLoad;
3489  Info.align = Align(8);
3490  return true;
3491  }
3492 
3493  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3494  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3495  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3496  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3497  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3498  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3499  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3500  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3501  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3502  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3503  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3504  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3505  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3506  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3507  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3508  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3509 
3510  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3511  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3512  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3513  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3514  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3515  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3516  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3517  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3518  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3519  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3520  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3521  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3522  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3523  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3524  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3525  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3526  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3527  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3528  Info.opc = ISD::INTRINSIC_W_CHAIN;
3529  Info.memVT = MVT::v4i32;
3530  Info.ptrVal = I.getArgOperand(0);
3531  Info.offset = 0;
3532  Info.flags = MachineMemOperand::MOLoad;
3533  Info.align = Align(16);
3534  return true;
3535  }
3536 
3537  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3538  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3539  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3540  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3541  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3542  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3543  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3544  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3545 
3546  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3547  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3548  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3549  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3550  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3551  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3552  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3553  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3554  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3555  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3556  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3557  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3558  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3559  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3560  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3561  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3562  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3563  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3564  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3565  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3566  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3567  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3568  Info.opc = ISD::INTRINSIC_W_CHAIN;
3569  Info.memVT = MVT::i32;
3570  Info.ptrVal = I.getArgOperand(0);
3571  Info.offset = 0;
3572  Info.flags = MachineMemOperand::MOLoad;
3573  Info.align = Align(4);
3574  return true;
3575  }
3576 
3577  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3578  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3579  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3580  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3581  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3582  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3583  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3584  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3585  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3586  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3587  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3588  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3589  Info.opc = ISD::INTRINSIC_W_CHAIN;
3590  Info.memVT = MVT::v4f16;
3591  Info.ptrVal = I.getArgOperand(0);
3592  Info.offset = 0;
3593  Info.flags = MachineMemOperand::MOLoad;
3594  Info.align = Align(16);
3595  return true;
3596  }
3597 
3598  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3599  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3600  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3601  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3602  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3603  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3604  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3605  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3606  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3607  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3608  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3609  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3610  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3611  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3612  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3613  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3614  Info.opc = ISD::INTRINSIC_W_CHAIN;
3615  Info.memVT = MVT::v8f32;
3616  Info.ptrVal = I.getArgOperand(0);
3617  Info.offset = 0;
3618  Info.flags = MachineMemOperand::MOLoad;
3619  Info.align = Align(16);
3620  return true;
3621  }
3622 
3623  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3624  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3625  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3626  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3627 
3628  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3629  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3630  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3631  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3632 
3633  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3634  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3635  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3636  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3637  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3638  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3639  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3640  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3641  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3642  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3643  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3644  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3645  Info.opc = ISD::INTRINSIC_W_CHAIN;
3646  Info.memVT = MVT::v8i32;
3647  Info.ptrVal = I.getArgOperand(0);
3648  Info.offset = 0;
3649  Info.flags = MachineMemOperand::MOLoad;
3650  Info.align = Align(16);
3651  return true;
3652  }
3653 
3654  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3655  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3656  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3657  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3658  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3659  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3660  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3661  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3662  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3663  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
3664  Info.opc = ISD::INTRINSIC_W_CHAIN;
3665  Info.memVT = MVT::v2i32;
3666  Info.ptrVal = I.getArgOperand(0);
3667  Info.offset = 0;
3668  Info.flags = MachineMemOperand::MOLoad;
3669  Info.align = Align(8);
3670  return true;
3671  }
3672 
3673  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3674  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3675  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3676  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3677 
3678  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3679  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3680  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3681  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
3682  Info.opc = ISD::INTRINSIC_W_CHAIN;
3683  Info.memVT = MVT::f64;
3684  Info.ptrVal = I.getArgOperand(0);
3685  Info.offset = 0;
3686  Info.flags = MachineMemOperand::MOLoad;
3687  Info.align = Align(8);
3688  return true;
3689  }
3690 
3691  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3692  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3693  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3694  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
3695  Info.opc = ISD::INTRINSIC_W_CHAIN;
3696  Info.memVT = MVT::v2f64;
3697  Info.ptrVal = I.getArgOperand(0);
3698  Info.offset = 0;
3699  Info.flags = MachineMemOperand::MOLoad;
3700  Info.align = Align(16);
3701  return true;
3702  }
3703 
3704  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3705  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3706  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3707  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3708  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3709  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3710  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3711  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3712  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3713  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3714  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3715  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3716  Info.opc = ISD::INTRINSIC_VOID;
3717  Info.memVT = MVT::v4f16;
3718  Info.ptrVal = I.getArgOperand(0);
3719  Info.offset = 0;
3720  Info.flags = MachineMemOperand::MOStore;
3721  Info.align = Align(16);
3722  return true;
3723  }
3724 
3725  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3726  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3727  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3728  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3729  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3730  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3731  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3732  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3733  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3734  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3735  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3736  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3737  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3738  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3739  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3740  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
3741  Info.opc = ISD::INTRINSIC_VOID;
3742  Info.memVT = MVT::v8f32;
3743  Info.ptrVal = I.getArgOperand(0);
3744  Info.offset = 0;
3745  Info.flags = MachineMemOperand::MOStore;
3746  Info.align = Align(16);
3747  return true;
3748  }
3749 
3750  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3751  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3752  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3753  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3754  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3755  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3756  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3757  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3758  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3759  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3760  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3761  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3762  Info.opc = ISD::INTRINSIC_VOID;
3763  Info.memVT = MVT::v8i32;
3764  Info.ptrVal = I.getArgOperand(0);
3765  Info.offset = 0;
3766  Info.flags = MachineMemOperand::MOStore;
3767  Info.align = Align(16);
3768  return true;
3769  }
3770 
3771  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3772  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3773  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3774  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3775  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3776  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3777  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3778  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3779  Info.opc = ISD::INTRINSIC_VOID;
3780  Info.memVT = MVT::v2i32;
3781  Info.ptrVal = I.getArgOperand(0);
3782  Info.offset = 0;
3783  Info.flags = MachineMemOperand::MOStore;
3784  Info.align = Align(8);
3785  return true;
3786  }
3787 
3788  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
3789  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
3790  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
3791  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
3792  Info.opc = ISD::INTRINSIC_VOID;
3793  Info.memVT = MVT::v2f64;
3794  Info.ptrVal = I.getArgOperand(0);
3795  Info.offset = 0;
3796  Info.flags = MachineMemOperand::MOStore;
3797  Info.align = Align(16);
3798  return true;
3799  }
3800 
3801  case Intrinsic::nvvm_atomic_load_inc_32:
3802  case Intrinsic::nvvm_atomic_load_dec_32:
3803 
3804  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3805  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3806  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3807  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3808  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3809  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3810  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3811  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3812  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3813  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3814  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3815  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3816  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3817  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3818  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3819  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3820  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3821  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3822  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3823  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3824  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3825  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3826  auto &DL = I.getModule()->getDataLayout();
3827  Info.opc = ISD::INTRINSIC_W_CHAIN;
3828  Info.memVT = getValueType(DL, I.getType());
3829  Info.ptrVal = I.getArgOperand(0);
3830  Info.offset = 0;
3831  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3832  Info.align.reset();
3833  return true;
3834  }
3835 
3836  case Intrinsic::nvvm_ldu_global_i:
3837  case Intrinsic::nvvm_ldu_global_f:
3838  case Intrinsic::nvvm_ldu_global_p: {
3839  auto &DL = I.getModule()->getDataLayout();
3840  Info.opc = ISD::INTRINSIC_W_CHAIN;
3841  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3842  Info.memVT = getValueType(DL, I.getType());
3843  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3844  Info.memVT = getPointerTy(DL);
3845  else
3846  Info.memVT = getValueType(DL, I.getType());
3847  Info.ptrVal = I.getArgOperand(0);
3848  Info.offset = 0;
3849  Info.flags = MachineMemOperand::MOLoad;
3850  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3851 
3852  return true;
3853  }
3854  case Intrinsic::nvvm_ldg_global_i:
3855  case Intrinsic::nvvm_ldg_global_f:
3856  case Intrinsic::nvvm_ldg_global_p: {
3857  auto &DL = I.getModule()->getDataLayout();
3858 
3859  Info.opc = ISD::INTRINSIC_W_CHAIN;
3860  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3861  Info.memVT = getValueType(DL, I.getType());
3862  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3863  Info.memVT = getPointerTy(DL);
3864  else
3865  Info.memVT = getValueType(DL, I.getType());
3866  Info.ptrVal = I.getArgOperand(0);
3867  Info.offset = 0;
3868  Info.flags = MachineMemOperand::MOLoad;
3869  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3870 
3871  return true;
3872  }
3873 
3874  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3875  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3876  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3877  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3878  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3879  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3880  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3881  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3882  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3883  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3884  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3885  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3886  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3887  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3888  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3889  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3890  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3891  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3892  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3893  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3894  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3895  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3896  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3897  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3898  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3899  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3900  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3901  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3902  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3903  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3904  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3905  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3906  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3907  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3908  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3909  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3910  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3911  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3912  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3913  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3914  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3915  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3916  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3917  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3918  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3919  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3920  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3921  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3922  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3923  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3924  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3925  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3926  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3927  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3928  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3929  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3930  Info.opc = getOpcForTextureInstr(Intrinsic);
3931  Info.memVT = MVT::v4f32;
3932  Info.ptrVal = nullptr;
3933  Info.offset = 0;
3934  Info.flags = MachineMemOperand::MOLoad;
3935  Info.align = Align(16);
3936  return true;
3937 
3938  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3939  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3940  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3941  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3942  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3943  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3944  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3945  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3946  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3947  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3948  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3949  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3950  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3951  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3952  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3953  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3954  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3955  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3956  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3957  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3958  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3959  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3960  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3961  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3962  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3963  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3964  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3965  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3966  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3967  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3968  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3969  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3970  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3971  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3972  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3973  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3974  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3975  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3976  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3977  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3978  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3979  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3980  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3981  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3982  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3983  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3984  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3985  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3986  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3987  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3988  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3989  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3990  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3991  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3992  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3993  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3994  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3995  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3996  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3997  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3998  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3999  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4000  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4001  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4002  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4003  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4004  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4005  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4006  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4007  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4008  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4009  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4010  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4011  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4012  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4013  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4014  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4015  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4016  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4017  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4018  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4019  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4020  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4021  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4022  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4023  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4024  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4025  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4026  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4027  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4028  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4029  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4030  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4031  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4032  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4033  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4034  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4035  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4036  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4037  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4038  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4039  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4040  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4041  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4042  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4043  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4044  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4045  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4046  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4047  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4048  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4049  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4050  Info.opc = getOpcForTextureInstr(Intrinsic);
4051  Info.memVT = MVT::v4i32;
4052  Info.ptrVal = nullptr;
4053  Info.offset = 0;
4054  Info.flags = MachineMemOperand::MOLoad;
4055  Info.align = Align(16);
4056  return true;
4057 
4058  case Intrinsic::nvvm_suld_1d_i8_clamp:
4059  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4060  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4061  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4062  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4063  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4064  case Intrinsic::nvvm_suld_2d_i8_clamp:
4065  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4066  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4067  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4068  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4069  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4070  case Intrinsic::nvvm_suld_3d_i8_clamp:
4071  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4072  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4073  case Intrinsic::nvvm_suld_1d_i8_trap:
4074  case Intrinsic::nvvm_suld_1d_v2i8_trap:
4075  case Intrinsic::nvvm_suld_1d_v4i8_trap:
4076  case Intrinsic::nvvm_suld_1d_array_i8_trap:
4077  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4078  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4079  case Intrinsic::nvvm_suld_2d_i8_trap:
4080  case Intrinsic::nvvm_suld_2d_v2i8_trap:
4081  case Intrinsic::nvvm_suld_2d_v4i8_trap:
4082  case Intrinsic::nvvm_suld_2d_array_i8_trap:
4083  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4084  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4085  case Intrinsic::nvvm_suld_3d_i8_trap:
4086  case Intrinsic::nvvm_suld_3d_v2i8_trap:
4087  case Intrinsic::nvvm_suld_3d_v4i8_trap:
4088  case Intrinsic::nvvm_suld_1d_i8_zero:
4089  case Intrinsic::nvvm_suld_1d_v2i8_zero:
4090  case Intrinsic::nvvm_suld_1d_v4i8_zero:
4091  case Intrinsic::nvvm_suld_1d_array_i8_zero:
4092  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4093  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4094  case Intrinsic::nvvm_suld_2d_i8_zero:
4095  case Intrinsic::nvvm_suld_2d_v2i8_zero:
4096  case Intrinsic::nvvm_suld_2d_v4i8_zero:
4097  case Intrinsic::nvvm_suld_2d_array_i8_zero:
4098  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4099  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4100  case Intrinsic::nvvm_suld_3d_i8_zero:
4101  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4102  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4103  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4104  Info.memVT = MVT::i8;
4105  Info.ptrVal = nullptr;
4106  Info.offset = 0;
4107  Info.flags = MachineMemOperand::MOLoad;
4108  Info.align = Align(16);
4109  return true;
4110 
4111  case Intrinsic::nvvm_suld_1d_i16_clamp:
4112  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4113  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4114  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4115  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4116  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4117  case Intrinsic::nvvm_suld_2d_i16_clamp:
4118  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4119  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4120  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4121  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4122  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4123  case Intrinsic::nvvm_suld_3d_i16_clamp:
4124  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4125  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4126  case Intrinsic::nvvm_suld_1d_i16_trap:
4127  case Intrinsic::nvvm_suld_1d_v2i16_trap:
4128  case Intrinsic::nvvm_suld_1d_v4i16_trap:
4129  case Intrinsic::nvvm_suld_1d_array_i16_trap:
4130  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4131  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4132  case Intrinsic::nvvm_suld_2d_i16_trap:
4133  case Intrinsic::nvvm_suld_2d_v2i16_trap:
4134  case Intrinsic::nvvm_suld_2d_v4i16_trap:
4135  case Intrinsic::nvvm_suld_2d_array_i16_trap:
4136  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4137  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4138  case Intrinsic::nvvm_suld_3d_i16_trap:
4139  case Intrinsic::nvvm_suld_3d_v2i16_trap:
4140  case Intrinsic::nvvm_suld_3d_v4i16_trap:
4141  case Intrinsic::nvvm_suld_1d_i16_zero:
4142  case Intrinsic::nvvm_suld_1d_v2i16_zero:
4143  case Intrinsic::nvvm_suld_1d_v4i16_zero:
4144  case Intrinsic::nvvm_suld_1d_array_i16_zero:
4145  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4146  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4147  case Intrinsic::nvvm_suld_2d_i16_zero:
4148  case Intrinsic::nvvm_suld_2d_v2i16_zero:
4149  case Intrinsic::nvvm_suld_2d_v4i16_zero:
4150  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4151  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4152  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4153  case Intrinsic::nvvm_suld_3d_i16_zero:
4154  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4155  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4156  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4157  Info.memVT = MVT::i16;
4158  Info.ptrVal = nullptr;
4159  Info.offset = 0;
4160  Info.flags = MachineMemOperand::MOLoad;
4161  Info.align = Align(16);
4162  return true;
4163 
4164  case Intrinsic::nvvm_suld_1d_i32_clamp:
4165  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4166  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4167  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4168  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4169  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4170  case Intrinsic::nvvm_suld_2d_i32_clamp:
4171  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4172  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4173  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4174  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4175  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4176  case Intrinsic::nvvm_suld_3d_i32_clamp:
4177  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4178  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4179  case Intrinsic::nvvm_suld_1d_i32_trap:
4180  case Intrinsic::nvvm_suld_1d_v2i32_trap:
4181  case Intrinsic::nvvm_suld_1d_v4i32_trap:
4182  case Intrinsic::nvvm_suld_1d_array_i32_trap:
4183  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4184  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4185  case Intrinsic::nvvm_suld_2d_i32_trap:
4186  case Intrinsic::nvvm_suld_2d_v2i32_trap:
4187  case Intrinsic::nvvm_suld_2d_v4i32_trap:
4188  case Intrinsic::nvvm_suld_2d_array_i32_trap:
4189  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4190  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4191  case Intrinsic::nvvm_suld_3d_i32_trap:
4192  case Intrinsic::nvvm_suld_3d_v2i32_trap:
4193  case Intrinsic::nvvm_suld_3d_v4i32_trap:
4194  case Intrinsic::nvvm_suld_1d_i32_zero:
4195  case Intrinsic::nvvm_suld_1d_v2i32_zero:
4196  case Intrinsic::nvvm_suld_1d_v4i32_zero:
4197  case Intrinsic::nvvm_suld_1d_array_i32_zero:
4198  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4199  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4200  case Intrinsic::nvvm_suld_2d_i32_zero:
4201  case Intrinsic::nvvm_suld_2d_v2i32_zero:
4202  case Intrinsic::nvvm_suld_2d_v4i32_zero:
4203  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4204  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4205  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4206  case Intrinsic::nvvm_suld_3d_i32_zero:
4207  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4208  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4209  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4210  Info.memVT = MVT::i32;
4211  Info.ptrVal = nullptr;
4212  Info.offset = 0;
4213  Info.flags = MachineMemOperand::MOLoad;
4214  Info.align = Align(16);
4215  return true;
4216 
4217  case Intrinsic::nvvm_suld_1d_i64_clamp:
4218  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4219  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4220  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4221  case Intrinsic::nvvm_suld_2d_i64_clamp:
4222  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4223  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4224  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4225  case Intrinsic::nvvm_suld_3d_i64_clamp:
4226  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4227  case Intrinsic::nvvm_suld_1d_i64_trap:
4228  case Intrinsic::nvvm_suld_1d_v2i64_trap:
4229  case Intrinsic::nvvm_suld_1d_array_i64_trap:
4230  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4231  case Intrinsic::nvvm_suld_2d_i64_trap:
4232  case Intrinsic::nvvm_suld_2d_v2i64_trap:
4233  case Intrinsic::nvvm_suld_2d_array_i64_trap:
4234  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4235  case Intrinsic::nvvm_suld_3d_i64_trap:
4236  case Intrinsic::nvvm_suld_3d_v2i64_trap:
4237  case Intrinsic::nvvm_suld_1d_i64_zero:
4238  case Intrinsic::nvvm_suld_1d_v2i64_zero:
4239  case Intrinsic::nvvm_suld_1d_array_i64_zero:
4240  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4241  case Intrinsic::nvvm_suld_2d_i64_zero:
4242  case Intrinsic::nvvm_suld_2d_v2i64_zero:
4243  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4244  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4245  case Intrinsic::nvvm_suld_3d_i64_zero:
4246  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4247  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4248  Info.memVT = MVT::i64;
4249  Info.ptrVal = nullptr;
4250  Info.offset = 0;
4251  Info.flags = MachineMemOperand::MOLoad;
4252  Info.align = Align(16);
4253  return true;
4254  }
4255  return false;
4256 }
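// Editorial sketch (not part of the original file): for the f16 A/B-fragment
// WMMA loads in the first group above, the hook effectively reports
//
//   Info.opc    = ISD::INTRINSIC_W_CHAIN;     // chained node with results
//   Info.memVT  = MVT::v8f16;                 // 32 bytes read per fragment
//   Info.ptrVal = I.getArgOperand(0);         // the source pointer
//   Info.flags  = MachineMemOperand::MOLoad;  // read-only access
//   Info.align  = Align(16);
//
// so SelectionDAG attaches a MachineMemOperand to the intrinsic node and the
// scheduler and alias analysis can see the implicit memory access instead of
// an opaque call.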
4257 
4258 /// getFunctionParamOptimizedAlign - since function arguments are passed via
4259 /// .param space, we may want to increase their alignment in a way that
4260 /// ensures that we can effectively vectorize their loads & stores. We can
4261 /// increase alignment only if the function has internal or private
4262 /// linkage, as for other linkage types callers may already rely on default
4263 /// alignment. To allow using 128-bit vectorized loads/stores, this function
4264 /// ensures that alignment is 16 or greater.
4265 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
4266  const Function *F, Type *ArgTy, const DataLayout &DL) const {
4267  const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
4268 
4269  // If a function has linkage different from internal or private, we
4270  // must use default ABI alignment as external users rely on it.
4271  if (!F->hasLocalLinkage())
4272  return Align(ABITypeAlign);
4273 
4274  assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4275  return Align(std::max(uint64_t(16), ABITypeAlign));
4276 }
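// Editorial example (hedged): consider an internal-linkage device function
// that takes a 16-byte aggregate by value,
//
//   struct Quad { float x, y, z, w; };
//   static float sum(Quad q);
//
// Quad's ABI alignment is only 4, which would restrict its .param image to
// scalar ld.param.f32 accesses. Since no external caller can rely on that
// alignment, the code above raises it to 16 so the four fields can be
// fetched with a single 128-bit ld.param.v4.f32. For functions with
// non-local linkage the plain ABI alignment is kept.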
4277 
4278 /// isLegalAddressingMode - Return true if the addressing mode represented
4279 /// by AM is legal for this target, for a load/store of the specified type.
4280 /// Used to guide target specific optimizations, like loop strength reduction
4281 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
4282 /// (CodeGenPrepare.cpp)
4283 bool NVPTXTargetLowering::isLegalAddressingMode(
4284  const AddrMode &AM, Type *Ty,
4285  unsigned AS, Instruction *I) const {
4286  // AddrMode - This represents an addressing mode of:
4287  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4288  //
4289  // The legal address modes are
4290  // - [avar]
4291  // - [areg]
4292  // - [areg+immoff]
4293  // - [immAddr]
4294 
4295  if (AM.BaseGV) {
4296  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4297  }
4298 
4299  switch (AM.Scale) {
4300  case 0: // "r", "r+i" or "i" is allowed
4301  break;
4302  case 1:
4303  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4304  return false;
4305  // Otherwise we have r+i.
4306  break;
4307  default:
4308  // No scale > 1 is allowed
4309  return false;
4310  }
4311  return true;
4312 }
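// Editorial summary of the rules above (illustrative):
//   [globalVar]        -> legal    (BaseGV only)
//   [%rd1]             -> legal    (base register only)
//   [%rd1 + 16]        -> legal    (base register plus immediate offset)
//   [42]               -> legal    (absolute immediate address)
//   [globalVar + 8]    -> rejected (GV combined with an offset or register)
//   [%rd1 + %rd2]      -> rejected (no reg+reg form)
//   [%rd1 + 4*%rd2]    -> rejected (no scaled-index form)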
4313 
4314 //===----------------------------------------------------------------------===//
4315 // NVPTX Inline Assembly Support
4316 //===----------------------------------------------------------------------===//
4317 
4318 /// getConstraintType - Given a constraint letter, return the type of
4319 /// constraint it is for this target.
4320 NVPTXTargetLowering::ConstraintType
4321 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4322  if (Constraint.size() == 1) {
4323  switch (Constraint[0]) {
4324  default:
4325  break;
4326  case 'b':
4327  case 'r':
4328  case 'h':
4329  case 'c':
4330  case 'l':
4331  case 'f':
4332  case 'd':
4333  case '0':
4334  case 'N':
4335  return C_RegisterClass;
4336  }
4337  }
4338  return TargetLowering::getConstraintType(Constraint);
4339 }
4340 
4341 std::pair<unsigned, const TargetRegisterClass *>
4342 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4343  StringRef Constraint,
4344  MVT VT) const {
4345  if (Constraint.size() == 1) {
4346  switch (Constraint[0]) {
4347  case 'b':
4348  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4349  case 'c':
4350  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4351  case 'h':
4352  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4353  case 'r':
4354  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4355  case 'l':
4356  case 'N':
4357  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4358  case 'f':
4359  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4360  case 'd':
4361  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4362  }
4363  }
4364  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4365 }
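// Editorial usage note: these constraint letters are the ones used in PTX
// inline assembly, e.g. from CUDA source:
//
//   int res;
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
//
// Here 'r' selects a 32-bit integer register (Int32RegsRegClass above), 'h'
// a 16-bit register, 'l' a 64-bit register, 'f'/'d' an f32/f64 register, and
// 'b' a predicate register.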
4366 
4367 //===----------------------------------------------------------------------===//
4368 // NVPTX DAG Combining
4369 //===----------------------------------------------------------------------===//
4370 
4371 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4372  CodeGenOpt::Level OptLevel) const {
4373  // Always honor command-line argument
4374  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4375  return FMAContractLevelOpt > 0;
4376 
4377  // Do not contract if we're not optimizing the code.
4378  if (OptLevel == 0)
4379  return false;
4380 
4381  // Honor TargetOptions flags that explicitly say fusion is okay.
4382  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4383  return true;
4384 
4385  return allowUnsafeFPMath(MF);
4386 }
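// Editorial note (flag spellings are assumptions based on the cl::opt defined
// near the top of this file and the generic llc options): contraction can be
// controlled from the command line independently of the IR, e.g.
//
//   llc -fp-contract=fast ...       # TargetOptions::AllowFPOpFusion == Fast
//   llc -enable-unsafe-fp-math ...  # global unsafe-math also permits fusion
//   llc -nvptx-fma-level=0 ...      # explicit override, always honored first
//
// At -O0 the combine is skipped unless the override is given explicitly.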
4387 
4388 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4389  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4390  if (MF.getTarget().Options.UnsafeFPMath)
4391  return true;
4392 
4393  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4394  const Function &F = MF.getFunction();
4395  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
4396 }
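// Editorial note: the attribute queried above is the standard per-function
// string attribute, so IR such as
//
//   define float @foo(float %a, float %b) #0 { ... }
//   attributes #0 = { "unsafe-fp-math"="true" }
//
// opts a single function into the unsafe-math-gated combines (e.g. the FMA
// formation above) even when the module-wide UnsafeFPMath option is off.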
4397 
4398 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4399 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4400 /// called with the default operands, and if that fails, with commuted
4401 /// operands.
4402 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4403  TargetLowering::DAGCombinerInfo &DCI,
4404  const NVPTXSubtarget &Subtarget,
4405  CodeGenOpt::Level OptLevel) {
4406  SelectionDAG &DAG = DCI.DAG;
4407  // Skip non-integer, non-scalar case
4408  EVT VT=N0.getValueType();
4409  if (VT.isVector())
4410  return SDValue();
4411 
4412  // fold (add (mul a, b), c) -> (mad a, b, c)
4413  //
4414  if (N0.getOpcode() == ISD::MUL) {
4415  assert (VT.isInteger());
4416  // For integer:
4417  // Since integer multiply-add costs the same as integer multiply
4418  // but is more costly than integer add, do the fusion only when
4419  // the mul is only used in the add.
4420  if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4421  !N0.getNode()->hasOneUse())
4422  return SDValue();
4423 
4424  // Do the folding
4425  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4426  N0.getOperand(0), N0.getOperand(1), N1);
4427  }
4428  else if (N0.getOpcode() == ISD::FMUL) {
4429  if (VT == MVT::f32 || VT == MVT::f64) {
4430  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4431  &DAG.getTargetLoweringInfo());
4432  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4433  return SDValue();
4434 
4435  // For floating point:
4436  // Do the fusion only when the mul has less than 5 uses and all
4437  // are add.
4438  // The heuristic is that if a use is not an add, then that use
4439  // cannot be fused into fma, therefore mul is still needed anyway.
4440  // If there are more than 4 uses, even if they are all add, fusing
4441  // them will increase register pressure.
4442  //
4443  int numUses = 0;
4444  int nonAddCount = 0;
4445  for (const SDNode *User : N0.getNode()->uses()) {
4446  numUses++;
4447  if (User->getOpcode() != ISD::FADD)
4448  ++nonAddCount;
4449  }
4450  if (numUses >= 5)
4451  return SDValue();
4452  if (nonAddCount) {
4453  int orderNo = N->getIROrder();
4454  int orderNo2 = N0.getNode()->getIROrder();
4455  // Simple heuristic for estimating potential register
4456  // pressure: the difference in IR order is used to measure the
4457  // distance between the def and its use; the longer the distance,
4458  // the more likely it is to cause register pressure.
4459  if (orderNo - orderNo2 < 500)
4460  return SDValue();
4461 
4462  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4463  // which guarantees that the FMA will not increase register pressure at node N.
4464  bool opIsLive = false;
4465  const SDNode *left = N0.getOperand(0).getNode();
4466  const SDNode *right = N0.getOperand(1).getNode();
4467 
4468  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4469  opIsLive = true;
4470 
4471  if (!opIsLive)
4472  for (const SDNode *User : left->uses()) {
4473  int orderNo3 = User->getIROrder();
4474  if (orderNo3 > orderNo) {
4475  opIsLive = true;
4476  break;
4477  }
4478  }
4479 
4480  if (!opIsLive)
4481  for (const SDNode *User : right->uses()) {
4482  int orderNo3 = User->getIROrder();
4483  if (orderNo3 > orderNo) {
4484  opIsLive = true;
4485  break;