1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
33 #include "llvm/IR/Argument.h"
34 #include "llvm/IR/Attributes.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/GlobalValue.h"
40 #include "llvm/IR/Instruction.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/IntrinsicsNVPTX.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/Value.h"
46 #include "llvm/Support/Casting.h"
47 #include "llvm/Support/CodeGen.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <iterator>
59 #include <sstream>
60 #include <string>
61 #include <utility>
62 #include <vector>
63 
64 #define DEBUG_TYPE "nvptx-lower"
65 
66 using namespace llvm;
67 
68 static std::atomic<unsigned> GlobalUniqueCallSite;
69 
70 static cl::opt<bool> sched4reg(
71  "nvptx-sched4reg",
72  cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
73 
74 static cl::opt<unsigned>
75  FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
76  cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
77  " 1: do it, 2: do it aggressively)"),
78  cl::init(2));
79 
80 static cl::opt<int> UsePrecDivF32(
81  "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
82  cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
83  " IEEE Compliant F32 div.rnd if available."),
84  cl::init(2));
85 
86 static cl::opt<bool> UsePrecSqrtF32(
87  "nvptx-prec-sqrtf32", cl::Hidden,
88  cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
89  cl::init(true));
90 
91 int NVPTXTargetLowering::getDivF32Level() const {
92  if (UsePrecDivF32.getNumOccurrences() > 0) {
93  // If nvptx-prec-divf32=N is used on the command-line, always honor it
94  return UsePrecDivF32;
95  } else {
96  // Otherwise, use div.approx if fast math is enabled
97  if (getTargetMachine().Options.UnsafeFPMath)
98  return 0;
99  else
100  return 2;
101  }
102 }
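// A brief illustration of the policy above (a summary derived from the code
// and option descriptions, not additional logic): with no -nvptx-prec-divf32
// flag on the command line, enabling fast math (UnsafeFPMath) selects level 0
// (div.approx) and the default is level 2 (IEEE-compliant div.rnd when
// available); an explicit -nvptx-prec-divf32=1 always wins and requests
// div.full.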
103 
104 bool NVPTXTargetLowering::usePrecSqrtF32() const {
105  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
106  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
107  return UsePrecSqrtF32;
108  } else {
109  // Otherwise, use sqrt.approx if fast math is enabled
110  return !getTargetMachine().Options.UnsafeFPMath;
111  }
112 }
113 
117 }
118 
119 static bool IsPTXVectorType(MVT VT) {
120  switch (VT.SimpleTy) {
121  default:
122  return false;
123  case MVT::v2i1:
124  case MVT::v4i1:
125  case MVT::v2i8:
126  case MVT::v4i8:
127  case MVT::v2i16:
128  case MVT::v4i16:
129  case MVT::v2i32:
130  case MVT::v4i32:
131  case MVT::v2i64:
132  case MVT::v2f16:
133  case MVT::v4f16:
134  case MVT::v8f16: // <4 x f16x2>
135  case MVT::v2f32:
136  case MVT::v4f32:
137  case MVT::v2f64:
138  return true;
139  }
140 }
141 
142 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
143 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
144 /// into their primitive components.
145 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
146 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
147 /// LowerCall, and LowerReturn.
148 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
149  Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
150  SmallVectorImpl<uint64_t> *Offsets = nullptr,
151  uint64_t StartingOffset = 0) {
152  SmallVector<EVT, 16> TempVTs;
153  SmallVector<uint64_t, 16> TempOffsets;
154 
155  // Special case for i128 - decompose to (i64, i64)
156  if (Ty->isIntegerTy(128)) {
157  ValueVTs.push_back(EVT(MVT::i64));
158  ValueVTs.push_back(EVT(MVT::i64));
159 
160  if (Offsets) {
161  Offsets->push_back(StartingOffset + 0);
162  Offsets->push_back(StartingOffset + 8);
163  }
164 
165  return;
166  }
167 
168  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
169  if (StructType *STy = dyn_cast<StructType>(Ty)) {
170  auto const *SL = DL.getStructLayout(STy);
171  auto ElementNum = 0;
172  for(auto *EI : STy->elements()) {
173  ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
174  StartingOffset + SL->getElementOffset(ElementNum));
175  ++ElementNum;
176  }
177  return;
178  }
179 
180  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
181  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
182  EVT VT = TempVTs[i];
183  uint64_t Off = TempOffsets[i];
184  // Split vectors into individual elements, except for v2f16, which
185  // we will pass as a single scalar.
186  if (VT.isVector()) {
187  unsigned NumElts = VT.getVectorNumElements();
188  EVT EltVT = VT.getVectorElementType();
189  // Vectors with an even number of f16 elements will be passed to
190  // us as an array of v2f16 elements. We must match this so we
191  // stay in sync with Ins/Outs.
192  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
193  EltVT = MVT::v2f16;
194  NumElts /= 2;
195  }
196  for (unsigned j = 0; j != NumElts; ++j) {
197  ValueVTs.push_back(EltVT);
198  if (Offsets)
199  Offsets->push_back(Off + j * EltVT.getStoreSize());
200  }
201  } else {
202  ValueVTs.push_back(VT);
203  if (Offsets)
204  Offsets->push_back(Off);
205  }
206  }
207 }
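// A worked example of the flattening above (derived from the code, not new
// behavior): a <4 x half> piece becomes ValueVTs = {v2f16, v2f16} with
// Offsets = {0, 4}; a <4 x i32> piece becomes four i32 entries at offsets
// 0, 4, 8, 12; and an i128 becomes two i64 entries at offsets 0 and 8, so the
// lists stay in sync with the Ins/Outs arrays of LowerFormalArguments,
// LowerCall, and LowerReturn.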
208 
209 // Check whether we can merge loads/stores of some of the pieces of a
210 // flattened function parameter or return value into a single vector
211 // load/store.
212 //
213 // The flattened parameter is represented as a list of EVTs and
214 // offsets, and the whole structure is aligned to ParamAlignment. This
215 // function determines whether we can load/store pieces of the
216 // parameter starting at index Idx using a single vectorized op of
217 // size AccessSize. If so, it returns the number of param pieces
218 // covered by the vector op. Otherwise, it returns 1.
219 static unsigned CanMergeParamLoadStoresStartingAt(
220  unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
221  const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
222 
223  // Can't vectorize if param alignment is not sufficient.
224  if (ParamAlignment < AccessSize)
225  return 1;
226  // Can't vectorize if offset is not aligned.
227  if (Offsets[Idx] & (AccessSize - 1))
228  return 1;
229 
230  EVT EltVT = ValueVTs[Idx];
231  unsigned EltSize = EltVT.getStoreSize();
232 
233  // Element is too large to vectorize.
234  if (EltSize >= AccessSize)
235  return 1;
236 
237  unsigned NumElts = AccessSize / EltSize;
238  // Can't vectorize if AccessSize is not a multiple of EltSize.
239  if (AccessSize != EltSize * NumElts)
240  return 1;
241 
242  // We don't have enough elements to vectorize.
243  if (Idx + NumElts > ValueVTs.size())
244  return 1;
245 
246  // PTX ISA can only deal with 2- and 4-element vector ops.
247  if (NumElts != 4 && NumElts != 2)
248  return 1;
249 
250  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
251  // Types do not match.
252  if (ValueVTs[j] != EltVT)
253  return 1;
254 
255  // Elements are not contiguous.
256  if (Offsets[j] - Offsets[j - 1] != EltSize)
257  return 1;
258  }
259  // OK. We can vectorize ValueVTs[i..i+NumElts)
260  return NumElts;
261 }
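// A worked example (derived from the checks above): given
// ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12} and
// ParamAlignment = 16, a query with Idx = 0 and AccessSize = 16 returns 4
// (one 128-bit vector op covers all four pieces). With ParamAlignment = 8, the
// same query returns 1 for AccessSize = 16 but 2 for AccessSize = 8.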
262 
263 // Flags for tracking per-element vectorization state of loads/stores
264 // of a flattened function parameter or return value.
265 enum ParamVectorizationFlags {
266  PVF_INNER = 0x0, // Middle elements of a vector.
267  PVF_FIRST = 0x1, // First element of the vector.
268  PVF_LAST = 0x2, // Last element of the vector.
269  // Scalar is effectively a 1-element vector.
270  PVF_SCALAR = PVF_FIRST | PVF_LAST,
271 };
272 
273 // Computes whether and how we can vectorize the loads/stores of a
274 // flattened function parameter or return value.
275 //
276 // The flattened parameter is represented as the list of ValueVTs and
277 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
278 // of the same size as ValueVTs indicating how each piece should be
279 // loaded/stored (i.e. as a scalar, or as part of a vector
280 // load/store).
281 static SmallVector<ParamVectorizationFlags, 16>
282 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
283  const SmallVectorImpl<uint64_t> &Offsets,
284  Align ParamAlignment) {
285  // Set vector size to match ValueVTs and mark all elements as
286  // scalars by default.
287  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
288  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
289 
290  // Check what we can vectorize using 128/64/32/16-bit accesses.
291  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
292  // Skip elements we've already processed.
293  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
294  for (unsigned AccessSize : {16, 8, 4, 2}) {
295  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
296  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
297  // Mark vectorized elements.
298  switch (NumElts) {
299  default:
300  llvm_unreachable("Unexpected return value");
301  case 1:
302  // Can't vectorize using this size, try next smaller size.
303  continue;
304  case 2:
305  assert(I + 1 < E && "Not enough elements.");
306  VectorInfo[I] = PVF_FIRST;
307  VectorInfo[I + 1] = PVF_LAST;
308  I += 1;
309  break;
310  case 4:
311  assert(I + 3 < E && "Not enough elements.");
312  VectorInfo[I] = PVF_FIRST;
313  VectorInfo[I + 1] = PVF_INNER;
314  VectorInfo[I + 2] = PVF_INNER;
315  VectorInfo[I + 3] = PVF_LAST;
316  I += 3;
317  break;
318  }
319  // Break out of the inner loop because we've already succeeded
320  // using largest possible AccessSize.
321  break;
322  }
323  }
324  return VectorInfo;
325 }
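// A worked example (derived from the loop above): four f32 pieces at offsets
// 0, 4, 8, 12 with 16-byte alignment produce
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e. a single v4 access, whereas
// the same pieces with only 4-byte alignment all remain PVF_SCALAR.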
326 
327 // NVPTXTargetLowering Constructor.
328 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
329  const NVPTXSubtarget &STI)
330  : TargetLowering(TM), nvTM(&TM), STI(STI) {
331  // Always lower memset, memcpy, and memmove intrinsics to load/store
332  // instructions, rather than generating calls to
333  // memset, memcpy or memmove.
334  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
335  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
336  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
337 
340 
341  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
342  // condition branches.
343  setJumpIsExpensive(true);
344 
345  // Wide divides are _very_ slow. Try to reduce the width of the divide if
346  // possible.
347  addBypassSlowDiv(64, 32);
348 
349  // By default, use the Source scheduling
350  if (sched4reg)
351  setSchedulingPreference(Sched::RegPressure);
352  else
353  setSchedulingPreference(Sched::Source);
354 
355  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
356  LegalizeAction NoF16Action) {
357  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
358  };
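  // For example, setFP16OperationAction(ISD::FADD, MVT::f16, Legal, Promote)
  // marks f16 FADD as Legal when the subtarget allows native fp16 math and as
  // Promote (performed in f32) otherwise; the same helper is reused for the
  // v2f16 operations below.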
359 
360  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
361  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
362  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
363  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
364  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
365  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
366  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
367  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
368 
369  // Conversion to/from FP16/FP16x2 is always legal.
376 
377  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
378  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
379 
380  // Operations not directly supported by NVPTX.
385  }
386 
387  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
388  // For others we will expand to a SHL/SRA pair.
394 
401 
404 
405  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
406  // that don't have h/w rotation we lower them to multi-instruction assembly.
407  // See ROT*_sw in NVPTXIntrInfo.td
412 
420 
421  // Indirect branch is not supported.
422  // This also disables Jump Table creation.
425 
428 
429  // We want to legalize constant-related memmove and memcpy
430  // intrinsics.
432 
433  // Turn FP extload into load/fpextend
443  // Turn FP truncstore into trunc + store.
444  // FIXME: vector types should also be expanded
448 
449  // PTX does not support load / store predicate registers
452 
453  for (MVT VT : MVT::integer_valuetypes()) {
457  }
458 
459  // This is legal in NVPTX
463 
464  // TRAP can be lowered to PTX trap
466 
467  // Register custom handling for vector loads/stores
468  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
469  if (IsPTXVectorType(VT)) {
473  }
474  }
475 
476  // Custom handling for i8 intrinsics
478 
479  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
485 
488  }
489 
493 
494  // PTX does not directly support SELP of i1, so promote to i32 first
496 
497  // PTX cannot multiply two i64s in a single instruction.
500 
501  // We have some custom DAG combine patterns for these nodes
509 
510  // setcc for f16x2 needs special handling to prevent the legalizer's
511  // attempt to scalarize it, since v2i1 is not legal.
512  if (STI.allowFP16Math())
514 
515  // Promote fp16 arithmetic if fp16 hardware isn't available or the
516  // user passed --nvptx-no-fp16-math. The flag is useful because,
517  // although sm_53+ GPUs have some sort of FP16 support in
518  // hardware, only sm_53 and sm_60 have a full implementation. Others
519  // have only a token amount of hardware and are likely to run faster
520  // by using fp32 units instead.
521  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
522  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
523  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
524  }
525 
526  // There's no neg.f16 instruction. Expand to (0-x).
529 
530  // (would be) Library functions.
531 
532  // These map to conversion instructions for scalar FP types.
533  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
534  ISD::FTRUNC}) {
539  }
540 
545 
546 
547  // 'Expand' implements FCOPYSIGN without calling an external library.
552 
553  // These map to corresponding instructions for f32/f64. f16 must be
554  // promoted to f32. v2f16 is expanded to f16, which is then promoted
555  // to f32.
556  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
562  }
567 
568  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
569  // No FPOW or FREM in PTX.
570 
571  // Now deduce the information based on the above-mentioned
572  // actions
573  computeRegisterProperties(STI.getRegisterInfo());
574 }
575 
576 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
577  switch ((NVPTXISD::NodeType)Opcode) {
578  case NVPTXISD::FIRST_NUMBER:
579  break;
580  case NVPTXISD::CALL:
581  return "NVPTXISD::CALL";
582  case NVPTXISD::RET_FLAG:
583  return "NVPTXISD::RET_FLAG";
585  return "NVPTXISD::LOAD_PARAM";
586  case NVPTXISD::Wrapper:
587  return "NVPTXISD::Wrapper";
589  return "NVPTXISD::DeclareParam";
591  return "NVPTXISD::DeclareScalarParam";
593  return "NVPTXISD::DeclareRet";
595  return "NVPTXISD::DeclareScalarRet";
597  return "NVPTXISD::DeclareRetParam";
598  case NVPTXISD::PrintCall:
599  return "NVPTXISD::PrintCall";
601  return "NVPTXISD::PrintConvergentCall";
603  return "NVPTXISD::PrintCallUni";
605  return "NVPTXISD::PrintConvergentCallUni";
606  case NVPTXISD::LoadParam:
607  return "NVPTXISD::LoadParam";
609  return "NVPTXISD::LoadParamV2";
611  return "NVPTXISD::LoadParamV4";
613  return "NVPTXISD::StoreParam";
615  return "NVPTXISD::StoreParamV2";
617  return "NVPTXISD::StoreParamV4";
619  return "NVPTXISD::StoreParamS32";
621  return "NVPTXISD::StoreParamU32";
623  return "NVPTXISD::CallArgBegin";
624  case NVPTXISD::CallArg:
625  return "NVPTXISD::CallArg";
627  return "NVPTXISD::LastCallArg";
629  return "NVPTXISD::CallArgEnd";
630  case NVPTXISD::CallVoid:
631  return "NVPTXISD::CallVoid";
632  case NVPTXISD::CallVal:
633  return "NVPTXISD::CallVal";
635  return "NVPTXISD::CallSymbol";
636  case NVPTXISD::Prototype:
637  return "NVPTXISD::Prototype";
638  case NVPTXISD::MoveParam:
639  return "NVPTXISD::MoveParam";
641  return "NVPTXISD::StoreRetval";
643  return "NVPTXISD::StoreRetvalV2";
645  return "NVPTXISD::StoreRetvalV4";
647  return "NVPTXISD::PseudoUseParam";
648  case NVPTXISD::RETURN:
649  return "NVPTXISD::RETURN";
651  return "NVPTXISD::CallSeqBegin";
653  return "NVPTXISD::CallSeqEnd";
655  return "NVPTXISD::CallPrototype";
656  case NVPTXISD::ProxyReg:
657  return "NVPTXISD::ProxyReg";
658  case NVPTXISD::LoadV2:
659  return "NVPTXISD::LoadV2";
660  case NVPTXISD::LoadV4:
661  return "NVPTXISD::LoadV4";
662  case NVPTXISD::LDGV2:
663  return "NVPTXISD::LDGV2";
664  case NVPTXISD::LDGV4:
665  return "NVPTXISD::LDGV4";
666  case NVPTXISD::LDUV2:
667  return "NVPTXISD::LDUV2";
668  case NVPTXISD::LDUV4:
669  return "NVPTXISD::LDUV4";
670  case NVPTXISD::StoreV2:
671  return "NVPTXISD::StoreV2";
672  case NVPTXISD::StoreV4:
673  return "NVPTXISD::StoreV4";
675  return "NVPTXISD::FUN_SHFL_CLAMP";
677  return "NVPTXISD::FUN_SHFR_CLAMP";
678  case NVPTXISD::IMAD:
679  return "NVPTXISD::IMAD";
681  return "NVPTXISD::SETP_F16X2";
682  case NVPTXISD::Dummy:
683  return "NVPTXISD::Dummy";
685  return "NVPTXISD::MUL_WIDE_SIGNED";
687  return "NVPTXISD::MUL_WIDE_UNSIGNED";
688  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
689  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
691  return "NVPTXISD::Tex1DFloatFloatLevel";
693  return "NVPTXISD::Tex1DFloatFloatGrad";
694  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
695  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
697  return "NVPTXISD::Tex1DS32FloatLevel";
699  return "NVPTXISD::Tex1DS32FloatGrad";
700  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
701  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
703  return "NVPTXISD::Tex1DU32FloatLevel";
705  return "NVPTXISD::Tex1DU32FloatGrad";
706  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
707  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
709  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
711  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
712  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
713  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
715  return "NVPTXISD::Tex1DArrayS32FloatLevel";
717  return "NVPTXISD::Tex1DArrayS32FloatGrad";
718  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
719  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
721  return "NVPTXISD::Tex1DArrayU32FloatLevel";
723  return "NVPTXISD::Tex1DArrayU32FloatGrad";
724  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
725  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
727  return "NVPTXISD::Tex2DFloatFloatLevel";
729  return "NVPTXISD::Tex2DFloatFloatGrad";
730  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
731  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
733  return "NVPTXISD::Tex2DS32FloatLevel";
735  return "NVPTXISD::Tex2DS32FloatGrad";
736  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
737  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
739  return "NVPTXISD::Tex2DU32FloatLevel";
741  return "NVPTXISD::Tex2DU32FloatGrad";
742  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
743  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
745  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
747  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
748  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
749  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
751  return "NVPTXISD::Tex2DArrayS32FloatLevel";
753  return "NVPTXISD::Tex2DArrayS32FloatGrad";
754  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
755  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
757  return "NVPTXISD::Tex2DArrayU32FloatLevel";
759  return "NVPTXISD::Tex2DArrayU32FloatGrad";
760  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
761  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
763  return "NVPTXISD::Tex3DFloatFloatLevel";
765  return "NVPTXISD::Tex3DFloatFloatGrad";
766  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
767  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
769  return "NVPTXISD::Tex3DS32FloatLevel";
771  return "NVPTXISD::Tex3DS32FloatGrad";
772  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
773  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
775  return "NVPTXISD::Tex3DU32FloatLevel";
777  return "NVPTXISD::Tex3DU32FloatGrad";
778  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
780  return "NVPTXISD::TexCubeFloatFloatLevel";
781  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
783  return "NVPTXISD::TexCubeS32FloatLevel";
784  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
786  return "NVPTXISD::TexCubeU32FloatLevel";
788  return "NVPTXISD::TexCubeArrayFloatFloat";
790  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
792  return "NVPTXISD::TexCubeArrayS32Float";
794  return "NVPTXISD::TexCubeArrayS32FloatLevel";
796  return "NVPTXISD::TexCubeArrayU32Float";
798  return "NVPTXISD::TexCubeArrayU32FloatLevel";
800  return "NVPTXISD::Tld4R2DFloatFloat";
802  return "NVPTXISD::Tld4G2DFloatFloat";
804  return "NVPTXISD::Tld4B2DFloatFloat";
806  return "NVPTXISD::Tld4A2DFloatFloat";
808  return "NVPTXISD::Tld4R2DS64Float";
810  return "NVPTXISD::Tld4G2DS64Float";
812  return "NVPTXISD::Tld4B2DS64Float";
814  return "NVPTXISD::Tld4A2DS64Float";
816  return "NVPTXISD::Tld4R2DU64Float";
818  return "NVPTXISD::Tld4G2DU64Float";
820  return "NVPTXISD::Tld4B2DU64Float";
822  return "NVPTXISD::Tld4A2DU64Float";
823 
825  return "NVPTXISD::TexUnified1DFloatS32";
827  return "NVPTXISD::TexUnified1DFloatFloat";
829  return "NVPTXISD::TexUnified1DFloatFloatLevel";
831  return "NVPTXISD::TexUnified1DFloatFloatGrad";
833  return "NVPTXISD::TexUnified1DS32S32";
835  return "NVPTXISD::TexUnified1DS32Float";
837  return "NVPTXISD::TexUnified1DS32FloatLevel";
839  return "NVPTXISD::TexUnified1DS32FloatGrad";
841  return "NVPTXISD::TexUnified1DU32S32";
843  return "NVPTXISD::TexUnified1DU32Float";
845  return "NVPTXISD::TexUnified1DU32FloatLevel";
847  return "NVPTXISD::TexUnified1DU32FloatGrad";
849  return "NVPTXISD::TexUnified1DArrayFloatS32";
851  return "NVPTXISD::TexUnified1DArrayFloatFloat";
853  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
855  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
857  return "NVPTXISD::TexUnified1DArrayS32S32";
859  return "NVPTXISD::TexUnified1DArrayS32Float";
861  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
863  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
865  return "NVPTXISD::TexUnified1DArrayU32S32";
867  return "NVPTXISD::TexUnified1DArrayU32Float";
869  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
871  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
873  return "NVPTXISD::TexUnified2DFloatS32";
875  return "NVPTXISD::TexUnified2DFloatFloat";
877  return "NVPTXISD::TexUnified2DFloatFloatLevel";
879  return "NVPTXISD::TexUnified2DFloatFloatGrad";
881  return "NVPTXISD::TexUnified2DS32S32";
883  return "NVPTXISD::TexUnified2DS32Float";
885  return "NVPTXISD::TexUnified2DS32FloatLevel";
887  return "NVPTXISD::TexUnified2DS32FloatGrad";
889  return "NVPTXISD::TexUnified2DU32S32";
891  return "NVPTXISD::TexUnified2DU32Float";
893  return "NVPTXISD::TexUnified2DU32FloatLevel";
895  return "NVPTXISD::TexUnified2DU32FloatGrad";
897  return "NVPTXISD::TexUnified2DArrayFloatS32";
899  return "NVPTXISD::TexUnified2DArrayFloatFloat";
901  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
903  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
905  return "NVPTXISD::TexUnified2DArrayS32S32";
907  return "NVPTXISD::TexUnified2DArrayS32Float";
909  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
911  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
913  return "NVPTXISD::TexUnified2DArrayU32S32";
915  return "NVPTXISD::TexUnified2DArrayU32Float";
917  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
919  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
921  return "NVPTXISD::TexUnified3DFloatS32";
923  return "NVPTXISD::TexUnified3DFloatFloat";
925  return "NVPTXISD::TexUnified3DFloatFloatLevel";
927  return "NVPTXISD::TexUnified3DFloatFloatGrad";
929  return "NVPTXISD::TexUnified3DS32S32";
931  return "NVPTXISD::TexUnified3DS32Float";
933  return "NVPTXISD::TexUnified3DS32FloatLevel";
935  return "NVPTXISD::TexUnified3DS32FloatGrad";
937  return "NVPTXISD::TexUnified3DU32S32";
939  return "NVPTXISD::TexUnified3DU32Float";
941  return "NVPTXISD::TexUnified3DU32FloatLevel";
943  return "NVPTXISD::TexUnified3DU32FloatGrad";
945  return "NVPTXISD::TexUnifiedCubeFloatFloat";
947  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
949  return "NVPTXISD::TexUnifiedCubeS32Float";
951  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
953  return "NVPTXISD::TexUnifiedCubeU32Float";
955  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
957  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
959  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
961  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
963  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
965  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
967  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
969  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
971  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
973  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
975  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
977  return "NVPTXISD::Tld4UnifiedR2DS64Float";
979  return "NVPTXISD::Tld4UnifiedG2DS64Float";
981  return "NVPTXISD::Tld4UnifiedB2DS64Float";
983  return "NVPTXISD::Tld4UnifiedA2DS64Float";
985  return "NVPTXISD::Tld4UnifiedR2DU64Float";
987  return "NVPTXISD::Tld4UnifiedG2DU64Float";
989  return "NVPTXISD::Tld4UnifiedB2DU64Float";
991  return "NVPTXISD::Tld4UnifiedA2DU64Float";
992 
993  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
994  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
995  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
996  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
997  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
998  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
999  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1000  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1001  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1002  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1003  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1004 
1005  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1006  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1007  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1008  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1009  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1010  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1011  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1012  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1013  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1014  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1015  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1016 
1017  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1018  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1019  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1020  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1021  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1022  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1023  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1024  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1025  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1026  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1027  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1028 
1029  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1030  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1031  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1032  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1033  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1034  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1035  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1036  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1037  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1038  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1039  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1040 
1041  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1042  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1043  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1044  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1045  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1046  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1047  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1048  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1049  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1050  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1051  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1052 
1053  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1054  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1055  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1056  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1057  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1058  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1059  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1060  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1061  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1062  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1063  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1064 
1065  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1066  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1067  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1068  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1069  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1070  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1071  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1072  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1073  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1074  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1075  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1076 
1077  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1078  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1079  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1080  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1081  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1082  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1083  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1084  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1085  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1086  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1087  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1088 
1089  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1090  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1091  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1092  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1093  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1094  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1095  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1096  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1097  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1098  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1099  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1100 
1101  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1102  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1103  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1104  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1105  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1106  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1107  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1108  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1109  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1110  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1111  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1112 
1113  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1114  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1115  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1116  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1117  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1118  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1119  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1120  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1121  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1122  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1123  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1124 
1125  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1126  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1127  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1128  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1129  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1130  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1131  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1132  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1133  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1134  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1135  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1136 
1137  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1138  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1139  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1140  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1141  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1142  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1143  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1144  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1145  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1146  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1147  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1148 
1149  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1150  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1151  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1152  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1153  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1154  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1155  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1156  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1157  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1158  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1159  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1160 
1161  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1162  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1163  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1164  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1165  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1166  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1167  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1168  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1169  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1170  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1171  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1172  }
1173  return nullptr;
1174 }
1175 
1176 TargetLoweringBase::LegalizeTypeAction
1177 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1178  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1179  return TypeSplitVector;
1180  if (VT == MVT::v2f16)
1181  return TypeLegal;
1182  return TargetLoweringBase::getPreferredVectorAction(VT);
1183 }
1184 
1185 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1186  int Enabled, int &ExtraSteps,
1187  bool &UseOneConst,
1188  bool Reciprocal) const {
1189  if (!(Enabled == ReciprocalEstimate::Enabled ||
1190  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1191  return SDValue();
1192 
1193  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1194  ExtraSteps = 0;
1195 
1196  SDLoc DL(Operand);
1197  EVT VT = Operand.getValueType();
1198  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1199 
1200  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1201  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1202  DAG.getConstant(IID, DL, MVT::i32), Operand);
1203  };
1204 
1205  // The sqrt and rsqrt refinement processes assume we always start out with an
1206  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1207  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1208  // any refinement, we must return a regular sqrt.
1209  if (Reciprocal || ExtraSteps > 0) {
1210  if (VT == MVT::f32)
1211  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1212  : Intrinsic::nvvm_rsqrt_approx_f);
1213  else if (VT == MVT::f64)
1214  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1215  else
1216  return SDValue();
1217  } else {
1218  if (VT == MVT::f32)
1219  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1220  : Intrinsic::nvvm_sqrt_approx_f);
1221  else {
1222  // There's no sqrt.approx.f64 instruction, so we emit
1223  // reciprocal(rsqrt(x)). This is faster than
1224  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1225  // x * rsqrt(x).)
1226  return DAG.getNode(
1227  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1228  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1229  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1230  }
1231  }
1232 }
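// A summary of the selection above (FTZ variants elided): an f32 rsqrt, or any
// estimate that will be refined, maps to nvvm_rsqrt_approx_f and an f64 one to
// nvvm_rsqrt_approx_d; a plain f32 sqrt maps to nvvm_sqrt_approx_f, and a plain
// f64 sqrt is emitted as rcp.approx(rsqrt.approx(x)) because PTX has no
// sqrt.approx.f64 instruction.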
1233 
1234 SDValue
1235 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1236  SDLoc dl(Op);
1237  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1238  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1239  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1240  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1241 }
1242 
1243 std::string NVPTXTargetLowering::getPrototype(
1244  const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1245  const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1246  const CallBase &CB, unsigned UniqueCallSite) const {
1247  auto PtrVT = getPointerTy(DL);
1248 
1249  bool isABI = (STI.getSmVersion() >= 20);
1250  assert(isABI && "Non-ABI compilation is not supported");
1251  if (!isABI)
1252  return "";
1253 
1254  std::stringstream O;
1255  O << "prototype_" << UniqueCallSite << " : .callprototype ";
1256 
1257  if (retTy->getTypeID() == Type::VoidTyID) {
1258  O << "()";
1259  } else {
1260  O << "(";
1261  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1262  unsigned size = 0;
1263  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1264  size = ITy->getBitWidth();
1265  } else {
1266  assert(retTy->isFloatingPointTy() &&
1267  "Floating point type expected here");
1268  size = retTy->getPrimitiveSizeInBits();
1269  }
1270  // PTX ABI requires all scalar return values to be at least 32
1271  // bits in size. fp16 normally uses .b16 as its storage type in
1272  // PTX, so its size must be adjusted here, too.
1273  if (size < 32)
1274  size = 32;
1275 
1276  O << ".param .b" << size << " _";
1277  } else if (isa<PointerType>(retTy)) {
1278  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1279  } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
1280  retTy->isIntegerTy(128)) {
1281  O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1282  << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1283  } else {
1284  llvm_unreachable("Unknown return type");
1285  }
1286  O << ") ";
1287  }
1288  O << "_ (";
1289 
1290  bool first = true;
1291 
1292  unsigned OIdx = 0;
1293  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1294  Type *Ty = Args[i].Ty;
1295  if (!first) {
1296  O << ", ";
1297  }
1298  first = false;
1299 
1300  if (!Outs[OIdx].Flags.isByVal()) {
1301  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1302  unsigned align = 0;
1303  const CallInst *CallI = cast<CallInst>(&CB);
1304  // +1 because index 0 is reserved for return type alignment
1305  if (!getAlign(*CallI, i + 1, align))
1306  align = DL.getABITypeAlignment(Ty);
1307  unsigned sz = DL.getTypeAllocSize(Ty);
1308  O << ".param .align " << align << " .b8 ";
1309  O << "_";
1310  O << "[" << sz << "]";
1311  // update the index for Outs
1312  SmallVector<EVT, 16> vtparts;
1313  ComputeValueVTs(*this, DL, Ty, vtparts);
1314  if (unsigned len = vtparts.size())
1315  OIdx += len - 1;
1316  continue;
1317  }
1318  // i8 types in IR will be i16 types in SDAG
1319  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1320  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1321  "type mismatch between callee prototype and arguments");
1322  // scalar type
1323  unsigned sz = 0;
1324  if (isa<IntegerType>(Ty)) {
1325  sz = cast<IntegerType>(Ty)->getBitWidth();
1326  if (sz < 32)
1327  sz = 32;
1328  } else if (isa<PointerType>(Ty)) {
1329  sz = PtrVT.getSizeInBits();
1330  } else if (Ty->isHalfTy())
1331  // PTX ABI requires all scalar parameters to be at least 32
1332  // bits in size. fp16 normally uses .b16 as its storage type
1333  // in PTX, so its size must be adjusted here, too.
1334  sz = 32;
1335  else
1336  sz = Ty->getPrimitiveSizeInBits();
1337  O << ".param .b" << sz << " ";
1338  O << "_";
1339  continue;
1340  }
1341  auto *PTy = dyn_cast<PointerType>(Ty);
1342  assert(PTy && "Param with byval attribute should be a pointer type");
1343  Type *ETy = PTy->getElementType();
1344 
1345  Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
1346  unsigned sz = DL.getTypeAllocSize(ETy);
1347  O << ".param .align " << align.value() << " .b8 ";
1348  O << "_";
1349  O << "[" << sz << "]";
1350  }
1351  O << ");";
1352  return O.str();
1353 }
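// An example of the string built above (names are illustrative): for an
// indirect call to a function of type "float (i32, float*)" on a 64-bit
// target, the prototype reads
//   prototype_<id> : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// scalar returns and parameters are widened to at least 32 bits, while
// aggregates become ".param .align <align> .b8 _[<size>]" entries.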
1354 
1355 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1356  const CallBase *CB, Type *Ty,
1357  unsigned Idx,
1358  const DataLayout &DL) const {
1359  if (!CB) {
1360  // CallSite is null, fall back to the ABI type alignment
1361  return DL.getABITypeAlign(Ty);
1362  }
1363 
1364  unsigned Alignment = 0;
1365  const Function *DirectCallee = CB->getCalledFunction();
1366 
1367  if (!DirectCallee) {
1368  // We don't have a direct function symbol, but that may be because of
1369  // constant cast instructions in the call.
1370 
1371  // With bitcast'd call targets, the instruction will be the call
1372  if (const auto *CI = dyn_cast<CallInst>(CB)) {
1373  // Check if we have call alignment metadata
1374  if (getAlign(*CI, Idx, Alignment))
1375  return Align(Alignment);
1376 
1377  const Value *CalleeV = CI->getCalledOperand();
1378  // Ignore any bitcast instructions
1379  while (isa<ConstantExpr>(CalleeV)) {
1380  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1381  if (!CE->isCast())
1382  break;
1383  // Look through the bitcast
1384  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1385  }
1386 
1387  // We have now looked past all of the bitcasts. Do we finally have a
1388  // Function?
1389  if (const auto *CalleeF = dyn_cast<Function>(CalleeV))
1390  DirectCallee = CalleeF;
1391  }
1392  }
1393 
1394  // Check for function alignment information if we found that the
1395  // ultimate target is a Function
1396  if (DirectCallee)
1397  if (getAlign(*DirectCallee, Idx, Alignment))
1398  return Align(Alignment);
1399 
1400  // Call is indirect or alignment information is not available, fall back to
1401  // the ABI type alignment
1402  return DL.getABITypeAlign(Ty);
1403 }
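// For example (hypothetical IR): a call through a constant bitcast such as
// "call float bitcast (float (i32)* @f to float (i8)*)(i8 %v)" still reaches
// @f through the ConstantExpr-stripping loop above, so alignment metadata on
// @f is honored; only truly indirect calls fall back to the ABI type alignment.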
1404 
1405 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1406  SmallVectorImpl<SDValue> &InVals) const {
1407  SelectionDAG &DAG = CLI.DAG;
1408  SDLoc dl = CLI.DL;
1409  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1410  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1411  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1412  SDValue Chain = CLI.Chain;
1413  SDValue Callee = CLI.Callee;
1414  bool &isTailCall = CLI.IsTailCall;
1415  ArgListTy &Args = CLI.getArgs();
1416  Type *RetTy = CLI.RetTy;
1417  const CallBase *CB = CLI.CB;
1418  const DataLayout &DL = DAG.getDataLayout();
1419 
1420  bool isABI = (STI.getSmVersion() >= 20);
1421  assert(isABI && "Non-ABI compilation is not supported");
1422  if (!isABI)
1423  return Chain;
1424 
1425  unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1426  SDValue tempChain = Chain;
1427  Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1428  SDValue InFlag = Chain.getValue(1);
1429 
1430  unsigned paramCount = 0;
1431  // Args.size() and Outs.size() need not match.
1432  // Outs.size() will be larger
1433  // * if there is an aggregate argument with multiple fields (each field
1434  // showing up separately in Outs)
1435  // * if there is a vector argument with more than typical vector-length
1436  // elements (generally if more than 4) where each vector element is
1437  // individually present in Outs.
1438  // So a different index should be used for indexing into Outs/OutVals.
1439  // See similar issue in LowerFormalArguments.
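  // For example, a single {i32, i32} aggregate argument is one entry in Args
  // but two entries in Outs/OutVals, which is why OIdx is advanced separately
  // from i in the loop below.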
1440  unsigned OIdx = 0;
1441  // Declare the .params or .reg needed to pass values
1442  // to the function
1443  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1444  EVT VT = Outs[OIdx].VT;
1445  Type *Ty = Args[i].Ty;
1446 
1447  if (!Outs[OIdx].Flags.isByVal()) {
1448  SmallVector<EVT, 16> VTs;
1449  SmallVector<uint64_t, 16> Offsets;
1450  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1451  Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL);
1452  unsigned AllocSize = DL.getTypeAllocSize(Ty);
1453  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1454  bool NeedAlign; // Does argument declaration specify alignment?
1455  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1456  // declare .param .align <align> .b8 .param<n>[<size>];
1457  SDValue DeclareParamOps[] = {
1458  Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1459  DAG.getConstant(paramCount, dl, MVT::i32),
1460  DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1461  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1462  DeclareParamOps);
1463  NeedAlign = true;
1464  } else {
1465  // declare .param .b<size> .param<n>;
1466  if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1467  // PTX ABI requires integral types to be at least 32 bits in
1468  // size. FP16 is loaded/stored using i16, so it's handled
1469  // here as well.
1470  AllocSize = 4;
1471  }
1472  SDValue DeclareScalarParamOps[] = {
1473  Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1474  DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1475  DAG.getConstant(0, dl, MVT::i32), InFlag};
1476  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1477  DeclareScalarParamOps);
1478  NeedAlign = false;
1479  }
1480  InFlag = Chain.getValue(1);
1481 
1482  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1483  // than 32-bits are sign extended or zero extended, depending on
1484  // whether they are signed or unsigned types. This case applies
1485  // only to scalar parameters and not to aggregate values.
1486  bool ExtendIntegerParam =
1487  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1488 
1489  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1490  SmallVector<SDValue, 6> StoreOperands;
1491  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1492  // New store.
1493  if (VectorInfo[j] & PVF_FIRST) {
1494  assert(StoreOperands.empty() && "Unfinished preceding store.");
1495  StoreOperands.push_back(Chain);
1496  StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1497  StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1498  }
1499 
1500  EVT EltVT = VTs[j];
1501  SDValue StVal = OutVals[OIdx];
1502  if (ExtendIntegerParam) {
1503  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1504  // zext/sext to i32
1505  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1506  : ISD::ZERO_EXTEND,
1507  dl, MVT::i32, StVal);
1508  } else if (EltVT.getSizeInBits() < 16) {
1509  // Use 16-bit registers for small stores as it's the
1510  // smallest general purpose register size supported by NVPTX.
1511  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1512  }
1513 
1514  // Record the value to store.
1515  StoreOperands.push_back(StVal);
1516 
1517  if (VectorInfo[j] & PVF_LAST) {
1518  unsigned NumElts = StoreOperands.size() - 3;
1519  NVPTXISD::NodeType Op;
1520  switch (NumElts) {
1521  case 1:
1522  Op = NVPTXISD::StoreParam;
1523  break;
1524  case 2:
1525  Op = NVPTXISD::StoreParamV2;
1526  break;
1527  case 4:
1528  Op = NVPTXISD::StoreParamV4;
1529  break;
1530  default:
1531  llvm_unreachable("Invalid vector info.");
1532  }
1533 
1534  StoreOperands.push_back(InFlag);
1535 
1536  // Adjust type of the store op if we've extended the scalar
1537  // return value.
1538  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1539  MaybeAlign EltAlign;
1540  if (NeedAlign)
1541  EltAlign = commonAlignment(ArgAlign, Offsets[j]);
1542 
1543  Chain = DAG.getMemIntrinsicNode(
1544  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1545  TheStoreType, MachinePointerInfo(), EltAlign,
1546  MachineMemOperand::MOStore);
1547  InFlag = Chain.getValue(1);
1548 
1549  // Cleanup.
1550  StoreOperands.clear();
1551  }
1552  ++OIdx;
1553  }
1554  assert(StoreOperands.empty() && "Unfinished parameter store.");
1555  if (VTs.size() > 0)
1556  --OIdx;
1557  ++paramCount;
1558  continue;
1559  }
1560 
1561  // ByVal arguments
1562  SmallVector<EVT, 16> VTs;
1563  SmallVector<uint64_t, 16> Offsets;
1564  auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1565  assert(PTy && "Type of a byval parameter should be pointer");
1566  ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1567 
1568  // declare .param .align <align> .b8 .param<n>[<size>];
1569  unsigned sz = Outs[OIdx].Flags.getByValSize();
1570  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1571  Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1572  // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1573  // so we don't need to worry about natural alignment or not.
1574  // See TargetLowering::LowerCallTo().
1575 
1576  // Enforce minimum alignment of 4 to work around ptxas miscompile
1577  // for sm_50+. See corresponding alignment adjustment in
1578  // emitFunctionParamList() for details.
1579  if (ArgAlign < Align(4))
1580  ArgAlign = Align(4);
1581  SDValue DeclareParamOps[] = {
1582  Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1583  DAG.getConstant(paramCount, dl, MVT::i32),
1584  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1585  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1586  DeclareParamOps);
1587  InFlag = Chain.getValue(1);
1588  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1589  EVT elemtype = VTs[j];
1590  int curOffset = Offsets[j];
1591  unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
1592  auto PtrVT = getPointerTy(DL);
1593  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1594  DAG.getConstant(curOffset, dl, PtrVT));
1595  SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1596  MachinePointerInfo(), PartAlign);
1597  if (elemtype.getSizeInBits() < 16) {
1598  theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1599  }
1600  SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1601  SDValue CopyParamOps[] = { Chain,
1602  DAG.getConstant(paramCount, dl, MVT::i32),
1603  DAG.getConstant(curOffset, dl, MVT::i32),
1604  theVal, InFlag };
1605  Chain = DAG.getMemIntrinsicNode(
1606  NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
1607  MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
1608 
1609  InFlag = Chain.getValue(1);
1610  }
1611  ++paramCount;
1612  }
1613 
1614  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1615  MaybeAlign retAlignment = None;
1616 
1617  // Handle Result
1618  if (Ins.size() > 0) {
1619  SmallVector<EVT, 16> resvtparts;
1620  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1621 
1622  // Declare
1623  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1624  // .param .b<size-in-bits> retval0
1625  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1626  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1627  // these three types to match the logic in
1628  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1629  // Plus, this behavior is consistent with nvcc's.
1630  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1631  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1632  // Scalar needs to be at least 32bit wide
1633  if (resultsz < 32)
1634  resultsz = 32;
1635  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1636  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1637  DAG.getConstant(resultsz, dl, MVT::i32),
1638  DAG.getConstant(0, dl, MVT::i32), InFlag };
1639  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1640  DeclareRetOps);
1641  InFlag = Chain.getValue(1);
1642  } else {
1643  retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1644  assert(retAlignment && "retAlignment is guaranteed to be set");
1645  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1646  SDValue DeclareRetOps[] = {
1647  Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1648  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1649  DAG.getConstant(0, dl, MVT::i32), InFlag};
1650  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1651  DeclareRetOps);
1652  InFlag = Chain.getValue(1);
1653  }
1654  }
1655 
1656  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1657  // between them we must rely on the call site value which is valid for
1658  // indirect calls but is always null for libcalls.
1659  bool isIndirectCall = !Func && CB;
1660 
1661  if (isa<ExternalSymbolSDNode>(Callee)) {
1662  Function* CalleeFunc = nullptr;
1663 
1664  // Try to find the callee in the current module.
1665  Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1666  assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1667 
1668  // Set the "libcall callee" attribute to indicate that the function
1669  // must always have a declaration.
1670  CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1671  }
1672 
1673  if (isIndirectCall) {
1674  // This is the indirect function call case: PTX requires a prototype of the
1675  // form
1676  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1677  // to be emitted, and the label has to be used as the last arg of the call
1678  // instruction.
1679  // The prototype is embedded in a string and put as the operand for a
1680  // CallPrototype SDNode which will print out to the value of the string.
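  // As an illustrative sketch (labels and registers are made up), an indirect
  // call to a function taking one .b32 param and returning one .b32 param
  // would roughly emit:
  //   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _);
  //   call (retval0), %r1, (param0), prototype_1;
  // where prototype_1 is the per-call-site label mentioned above.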
1681  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1682  std::string Proto =
1683  getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite);
1684  const char *ProtoStr =
1685  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1686  SDValue ProtoOps[] = {
1687  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1688  };
1689  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1690  InFlag = Chain.getValue(1);
1691  }
1692  // Op to just print "call"
1693  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1694  SDValue PrintCallOps[] = {
1695  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1696  };
1697  // We model convergent calls as separate opcodes.
1698  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1699  if (CLI.IsConvergent)
1700  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1701  : NVPTXISD::PrintConvergentCall;
1702  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1703  InFlag = Chain.getValue(1);
1704 
1705  // Ops to print out the function name
1706  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1707  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1708  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1709  InFlag = Chain.getValue(1);
1710 
1711  // Ops to print out the param list
1712  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1713  SDValue CallArgBeginOps[] = { Chain, InFlag };
1714  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1715  CallArgBeginOps);
1716  InFlag = Chain.getValue(1);
1717 
1718  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1719  unsigned opcode;
1720  if (i == (e - 1))
1721  opcode = NVPTXISD::LastCallArg;
1722  else
1723  opcode = NVPTXISD::CallArg;
1724  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1725  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1726  DAG.getConstant(i, dl, MVT::i32), InFlag };
1727  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1728  InFlag = Chain.getValue(1);
1729  }
1730  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1731  SDValue CallArgEndOps[] = { Chain,
1732  DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1733  InFlag };
1734  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1735  InFlag = Chain.getValue(1);
1736 
1737  if (isIndirectCall) {
1738  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1739  SDValue PrototypeOps[] = {
1740  Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
1741  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1742  InFlag = Chain.getValue(1);
1743  }
1744 
1745  SmallVector<SDValue, 16> ProxyRegOps;
1746  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1747 
1748  // Generate loads from param memory/moves from registers for result
1749  if (Ins.size() > 0) {
1750  SmallVector<EVT, 16> VTs;
1751  SmallVector<uint64_t, 16> Offsets;
1752  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1753  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1754 
1755  Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1756  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1757 
1758  SmallVector<EVT, 6> LoadVTs;
1759  int VecIdx = -1; // Index of the first element of the vector.
1760 
1761  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1762  // 32-bits are sign extended or zero extended, depending on whether
1763  // they are signed or unsigned types.
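    // For example, an i16 return value is read back with a 32-bit ld.param and
    // then truncated back to i16 below (a sketch of the rule quoted above).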
1764  bool ExtendIntegerRetVal =
1765  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1766 
1767  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1768  bool needTruncate = false;
1769  EVT TheLoadType = VTs[i];
1770  EVT EltType = Ins[i].VT;
1771  Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
1772  if (ExtendIntegerRetVal) {
1773  TheLoadType = MVT::i32;
1774  EltType = MVT::i32;
1775  needTruncate = true;
1776  } else if (TheLoadType.getSizeInBits() < 16) {
1777  if (VTs[i].isInteger())
1778  needTruncate = true;
1779  EltType = MVT::i16;
1780  }
1781 
1782  // Record index of the very first element of the vector.
1783  if (VectorInfo[i] & PVF_FIRST) {
1784  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1785  VecIdx = i;
1786  }
1787 
1788  LoadVTs.push_back(EltType);
1789 
1790  if (VectorInfo[i] & PVF_LAST) {
1791  unsigned NumElts = LoadVTs.size();
1792  LoadVTs.push_back(MVT::Other);
1793  LoadVTs.push_back(MVT::Glue);
1794  NVPTXISD::NodeType Op;
1795  switch (NumElts) {
1796  case 1:
1797  Op = NVPTXISD::LoadParam;
1798  break;
1799  case 2:
1800  Op = NVPTXISD::LoadParamV2;
1801  break;
1802  case 4:
1803  Op = NVPTXISD::LoadParamV4;
1804  break;
1805  default:
1806  llvm_unreachable("Invalid vector info.");
1807  }
1808 
1809  SDValue LoadOperands[] = {
1810  Chain, DAG.getConstant(1, dl, MVT::i32),
1811  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1812  SDValue RetVal = DAG.getMemIntrinsicNode(
1813  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1814  MachinePointerInfo(), EltAlign,
1815  MachineMemOperand::MOLoad);
1816 
1817  for (unsigned j = 0; j < NumElts; ++j) {
1818  ProxyRegOps.push_back(RetVal.getValue(j));
1819 
1820  if (needTruncate)
1821  ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1822  else
1823  ProxyRegTruncates.push_back(Optional<MVT>());
1824  }
1825 
1826  Chain = RetVal.getValue(NumElts);
1827  InFlag = RetVal.getValue(NumElts + 1);
1828 
1829  // Cleanup
1830  VecIdx = -1;
1831  LoadVTs.clear();
1832  }
1833  }
1834  }
1835 
1836  Chain = DAG.getCALLSEQ_END(
1837  Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true),
1838  DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl);
1839  InFlag = Chain.getValue(1);
1840 
1841  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1842  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1843  // dangling.
1844  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1845  SDValue Ret = DAG.getNode(
1846  NVPTXISD::ProxyReg, dl,
1847  DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1848  { Chain, ProxyRegOps[i], InFlag }
1849  );
1850 
1851  Chain = Ret.getValue(1);
1852  InFlag = Ret.getValue(2);
1853 
1854  if (ProxyRegTruncates[i].hasValue()) {
1855  Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1856  }
1857 
1858  InVals.push_back(Ret);
1859  }
1860 
1861  // set isTailCall to false for now, until we figure out how to express
1862  // tail call optimization in PTX
1863  isTailCall = false;
1864  return Chain;
1865 }
1866 
1867 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1868 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1869 // We use extract/insert/build vector instead, just as LegalizeOp() did in LLVM 2.5.
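// As an illustrative example, concatenating two v2f32 values a and b becomes
//   build_vector (extractelt a, 0), (extractelt a, 1),
//                (extractelt b, 0), (extractelt b, 1)
// entirely in registers, with no stack traffic.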
1870 SDValue
1871 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1872  SDNode *Node = Op.getNode();
1873  SDLoc dl(Node);
1874  SmallVector<SDValue, 8> Ops;
1875  unsigned NumOperands = Node->getNumOperands();
1876  for (unsigned i = 0; i < NumOperands; ++i) {
1877  SDValue SubOp = Node->getOperand(i);
1878  EVT VVT = SubOp.getNode()->getValueType(0);
1879  EVT EltVT = VVT.getVectorElementType();
1880  unsigned NumSubElem = VVT.getVectorNumElements();
1881  for (unsigned j = 0; j < NumSubElem; ++j) {
1882  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1883  DAG.getIntPtrConstant(j, dl)));
1884  }
1885  }
1886  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1887 }
1888 
1889 // We can initialize a constant f16x2 with a single .b32 move. Normally it
1890 // would get lowered as two constant loads and a vector-packing move.
1891 // mov.b16 %h1, 0x4000;
1892 // mov.b16 %h2, 0x3C00;
1893 // mov.b32 %hh2, {%h2, %h1};
1894 // Instead we want just a constant move:
1895 // mov.b32 %hh2, 0x40003C00
1896 //
1897 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1898 // generates good SASS in both cases.
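// As a concrete example, BUILD_VECTOR <half 1.0, half 2.0> packs operand 0
// (0x3C00) into the low 16 bits and operand 1 (0x4000) into the high 16 bits,
// producing the single i32 constant 0x40003C00 that is bitcast to v2f16 below.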
1899 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1900  SelectionDAG &DAG) const {
1901  //return Op;
1902  if (!(Op->getValueType(0) == MVT::v2f16 &&
1903  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1904  isa<ConstantFPSDNode>(Op->getOperand(1))))
1905  return Op;
1906 
1907  APInt E0 =
1908  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1909  APInt E1 =
1910  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1911  SDValue Const =
1912  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1913  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1914 }
1915 
1916 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1917  SelectionDAG &DAG) const {
1918  SDValue Index = Op->getOperand(1);
1919  // Constant index will be matched by tablegen.
1920  if (isa<ConstantSDNode>(Index.getNode()))
1921  return Op;
1922 
1923  // Extract individual elements and select one of them.
1924  SDValue Vector = Op->getOperand(0);
1925  EVT VectorVT = Vector.getValueType();
1926  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1927  EVT EltVT = VectorVT.getVectorElementType();
1928 
1929  SDLoc dl(Op.getNode());
1930  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1931  DAG.getIntPtrConstant(0, dl));
1932  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1933  DAG.getIntPtrConstant(1, dl));
1934  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1935  ISD::CondCode::SETEQ);
1936 }
1937 
1938 /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which either
1939 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
1940 /// amount, or
1941 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
1942 /// amount.
1943 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1944  SelectionDAG &DAG) const {
1945  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1946  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1947 
1948  EVT VT = Op.getValueType();
1949  unsigned VTBits = VT.getSizeInBits();
1950  SDLoc dl(Op);
1951  SDValue ShOpLo = Op.getOperand(0);
1952  SDValue ShOpHi = Op.getOperand(1);
1953  SDValue ShAmt = Op.getOperand(2);
1954  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1955 
1956  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1957  // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
1958  // {dHi, dLo} = {aHi, aLo} >> Amt
1959  // dHi = aHi >> Amt
1960  // dLo = shf.r.clamp aLo, aHi, Amt
1961 
1962  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1963  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1964  ShAmt);
1965 
1966  SDValue Ops[2] = { Lo, Hi };
1967  return DAG.getMergeValues(Ops, dl);
1968  }
1969  else {
1970  // {dHi, dLo} = {aHi, aLo} >> Amt
1971  // - if (Amt>=size) then
1972  // dLo = aHi >> (Amt-size)
1973  // dHi = aHi >> Amt (this is either all 0 or all 1)
1974  // else
1975  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1976  // dHi = aHi >> Amt
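    // Worked example (illustrative): for 32-bit halves and Amt = 8,
    //   dLo = (aLo >>logic 8) | (aHi << 24)
    //   dHi = aHi >> 8   (arithmetic for SRA_PARTS, logical for SRL_PARTS)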
1977 
1978  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1979  DAG.getConstant(VTBits, dl, MVT::i32),
1980  ShAmt);
1981  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1982  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1983  DAG.getConstant(VTBits, dl, MVT::i32));
1984  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1985  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1986  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1987 
1988  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1989  DAG.getConstant(VTBits, dl, MVT::i32),
1990  ISD::SETGE);
1991  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1992  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1993 
1994  SDValue Ops[2] = { Lo, Hi };
1995  return DAG.getMergeValues(Ops, dl);
1996  }
1997 }
1998 
1999 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2000 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2001 /// amount, or
2002 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2003 /// amount.
2004 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2005  SelectionDAG &DAG) const {
2006  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2007  assert(Op.getOpcode() == ISD::SHL_PARTS);
2008 
2009  EVT VT = Op.getValueType();
2010  unsigned VTBits = VT.getSizeInBits();
2011  SDLoc dl(Op);
2012  SDValue ShOpLo = Op.getOperand(0);
2013  SDValue ShOpHi = Op.getOperand(1);
2014  SDValue ShAmt = Op.getOperand(2);
2015 
2016  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2017  // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2018  // {dHi, dLo} = {aHi, aLo} << Amt
2019  // dHi = shf.l.clamp aLo, aHi, Amt
2020  // dLo = aLo << Amt
2021 
2022  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2023  ShAmt);
2024  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2025 
2026  SDValue Ops[2] = { Lo, Hi };
2027  return DAG.getMergeValues(Ops, dl);
2028  }
2029  else {
2030  // {dHi, dLo} = {aHi, aLo} << Amt
2031  // - if (Amt>=size) then
2032  // dLo = aLo << Amt (all 0)
2033  // dHi = aLo << (Amt-size)
2034  // else
2035  // dLo = aLo << Amt
2036  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
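    // Worked example (illustrative): for 32-bit halves and Amt = 8,
    //   dLo = aLo << 8
    //   dHi = (aHi << 8) | (aLo >> 24)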
2037 
2038  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2039  DAG.getConstant(VTBits, dl, MVT::i32),
2040  ShAmt);
2041  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2042  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2043  DAG.getConstant(VTBits, dl, MVT::i32));
2044  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2045  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2046  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2047 
2048  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2049  DAG.getConstant(VTBits, dl, MVT::i32),
2050  ISD::SETGE);
2051  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2052  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2053 
2054  SDValue Ops[2] = { Lo, Hi };
2055  return DAG.getMergeValues(Ops, dl);
2056  }
2057 }
2058 
2059 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2060  EVT VT = Op.getValueType();
2061 
2062  if (VT == MVT::f32)
2063  return LowerFROUND32(Op, DAG);
2064 
2065  if (VT == MVT::f64)
2066  return LowerFROUND64(Op, DAG);
2067 
2068  llvm_unreachable("unhandled type");
2069 }
2070 
2071 // This is the rounding method used in CUDA libdevice, in C-like code:
2072 // float roundf(float A)
2073 // {
2074 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2075 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2076 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2077 // }
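// For example, with this scheme roundf(2.5f) yields 3.0f (halfway cases round
// away from zero) and roundf(0.3f) yields 0.0f via the abs(A) < 0.5 branch.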
2078 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2079  SelectionDAG &DAG) const {
2080  SDLoc SL(Op);
2081  SDValue A = Op.getOperand(0);
2082  EVT VT = Op.getValueType();
2083 
2084  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2085 
2086  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2087  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2088  const int SignBitMask = 0x80000000;
2089  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2090  DAG.getConstant(SignBitMask, SL, MVT::i32));
2091  const int PointFiveInBits = 0x3F000000;
2092  SDValue PointFiveWithSignRaw =
2093  DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2094  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2095  SDValue PointFiveWithSign =
2096  DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2097  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2098  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2099 
2100  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2101  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2102  SDValue IsLarge =
2103  DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2104  ISD::SETOGT);
2105  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2106 
2107  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2108  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2109  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2110  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2111  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2112 }
2113 
2114 // The implementation of round(double) is similar to that of round(float) in
2115 // that they both separate the value range into three regions and use a method
2116 // specific to the region to round the values. However, round(double) first
2117 // calculates the round of the absolute value and then adds the sign back while
2118 // round(float) directly rounds the value with sign.
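// In C-like pseudocode, an illustrative sketch of the lowering below:
// double round(double A)
// {
//   double RoundedA = trunc(fabs(A) + 0.5);
//   RoundedA = fabs(A) < 0.5 ? 0.0 : RoundedA;
//   RoundedA = copysign(RoundedA, A);
//   return fabs(A) > 0x1.0p52 ? A : RoundedA;
// }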
2119 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2120  SelectionDAG &DAG) const {
2121  SDLoc SL(Op);
2122  SDValue A = Op.getOperand(0);
2123  EVT VT = Op.getValueType();
2124 
2125  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2126 
2127  // double RoundedA = (double) (int) (abs(A) + 0.5f);
2128  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2129  DAG.getConstantFP(0.5, SL, VT));
2130  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2131 
2132  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2133  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2134  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2135  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2136  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2137  DAG.getConstantFP(0, SL, VT),
2138  RoundedA);
2139 
2140  // Add sign to rounded_A
2141  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2142  DAG.getNode(ISD::FTRUNC, SL, VT, A);
2143 
2144  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2145  SDValue IsLarge =
2146  DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2147  ISD::SETOGT);
2148  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2149 }
2150 
2151 
2152 
2153 SDValue
2154 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2155  switch (Op.getOpcode()) {
2156  case ISD::RETURNADDR:
2157  return SDValue();
2158  case ISD::FRAMEADDR:
2159  return SDValue();
2160  case ISD::GlobalAddress:
2161  return LowerGlobalAddress(Op, DAG);
2162  case ISD::INTRINSIC_W_CHAIN:
2163  return Op;
2164  case ISD::BUILD_VECTOR:
2165  return LowerBUILD_VECTOR(Op, DAG);
2166  case ISD::EXTRACT_SUBVECTOR:
2167  return Op;
2168  case ISD::EXTRACT_VECTOR_ELT:
2169  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2170  case ISD::CONCAT_VECTORS:
2171  return LowerCONCAT_VECTORS(Op, DAG);
2172  case ISD::STORE:
2173  return LowerSTORE(Op, DAG);
2174  case ISD::LOAD:
2175  return LowerLOAD(Op, DAG);
2176  case ISD::SHL_PARTS:
2177  return LowerShiftLeftParts(Op, DAG);
2178  case ISD::SRA_PARTS:
2179  case ISD::SRL_PARTS:
2180  return LowerShiftRightParts(Op, DAG);
2181  case ISD::SELECT:
2182  return LowerSelect(Op, DAG);
2183  case ISD::FROUND:
2184  return LowerFROUND(Op, DAG);
2185  default:
2186  llvm_unreachable("Custom lowering not defined for operation");
2187  }
2188 }
2189 
2190 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2191  SDValue Op0 = Op->getOperand(0);
2192  SDValue Op1 = Op->getOperand(1);
2193  SDValue Op2 = Op->getOperand(2);
2194  SDLoc DL(Op.getNode());
2195 
2196  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2197 
2198  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2199  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2200  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2201  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2202 
2203  return Trunc;
2204 }
2205 
2206 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2207  if (Op.getValueType() == MVT::i1)
2208  return LowerLOADi1(Op, DAG);
2209 
2210  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2211  // loads and have to handle it here.
2212  if (Op.getValueType() == MVT::v2f16) {
2213  LoadSDNode *Load = cast<LoadSDNode>(Op);
2214  EVT MemVT = Load->getMemoryVT();
2215  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2216  MemVT, *Load->getMemOperand())) {
2217  SDValue Ops[2];
2218  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2219  return DAG.getMergeValues(Ops, SDLoc(Op));
2220  }
2221  }
2222 
2223  return SDValue();
2224 }
2225 
2226 // v = ld i1* addr
2227 // =>
2228 // v1 = ld i8* addr (-> i16)
2229 // v = trunc i16 to i1
2230 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2231  SDNode *Node = Op.getNode();
2232  LoadSDNode *LD = cast<LoadSDNode>(Node);
2233  SDLoc dl(Node);
2234  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2235  assert(Node->getValueType(0) == MVT::i1 &&
2236  "Custom lowering for i1 load only");
2237  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2238  LD->getPointerInfo(), LD->getAlignment(),
2239  LD->getMemOperand()->getFlags());
2240  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2241  // The legalizer (the caller) is expecting two values from the legalized
2242  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2243  // in LegalizeDAG.cpp which also uses MergeValues.
2244  SDValue Ops[] = { result, LD->getChain() };
2245  return DAG.getMergeValues(Ops, dl);
2246 }
2247 
2248 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2249  StoreSDNode *Store = cast<StoreSDNode>(Op);
2250  EVT VT = Store->getMemoryVT();
2251 
2252  if (VT == MVT::i1)
2253  return LowerSTOREi1(Op, DAG);
2254 
2255  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2256  // stores and have to handle it here.
2257  if (VT == MVT::v2f16 &&
2258  !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2259  VT, *Store->getMemOperand()))
2260  return expandUnalignedStore(Store, DAG);
2261 
2262  if (VT.isVector())
2263  return LowerSTOREVector(Op, DAG);
2264 
2265  return SDValue();
2266 }
2267 
2268 SDValue
2269 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2270  SDNode *N = Op.getNode();
2271  SDValue Val = N->getOperand(1);
2272  SDLoc DL(N);
2273  EVT ValVT = Val.getValueType();
2274 
2275  if (ValVT.isVector()) {
2276  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2277  // legal. We can (and should) split that into 2 stores of <2 x double> here
2278  // but I'm leaving that as a TODO for now.
2279  if (!ValVT.isSimple())
2280  return SDValue();
2281  switch (ValVT.getSimpleVT().SimpleTy) {
2282  default:
2283  return SDValue();
2284  case MVT::v2i8:
2285  case MVT::v2i16:
2286  case MVT::v2i32:
2287  case MVT::v2i64:
2288  case MVT::v2f16:
2289  case MVT::v2f32:
2290  case MVT::v2f64:
2291  case MVT::v4i8:
2292  case MVT::v4i16:
2293  case MVT::v4i32:
2294  case MVT::v4f16:
2295  case MVT::v4f32:
2296  case MVT::v8f16: // <4 x f16x2>
2297  // This is a "native" vector type
2298  break;
2299  }
2300 
2301  MemSDNode *MemSD = cast<MemSDNode>(N);
2302  const DataLayout &TD = DAG.getDataLayout();
2303 
2304  Align Alignment = MemSD->getAlign();
2305  Align PrefAlign =
2306  TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2307  if (Alignment < PrefAlign) {
2308  // This store is not sufficiently aligned, so bail out and let this vector
2309  // store be scalarized. Note that we may still be able to emit smaller
2310  // vector stores. For example, if we are storing a <4 x float> with an
2311  // alignment of 8, this check will fail but the legalizer will try again
2312  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2313  return SDValue();
2314  }
2315 
2316  unsigned Opcode = 0;
2317  EVT EltVT = ValVT.getVectorElementType();
2318  unsigned NumElts = ValVT.getVectorNumElements();
2319 
2320  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2321  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2322  // stored type to i16 and propagate the "real" type as the memory type.
2323  bool NeedExt = false;
2324  if (EltVT.getSizeInBits() < 16)
2325  NeedExt = true;
2326 
2327  bool StoreF16x2 = false;
2328  switch (NumElts) {
2329  default:
2330  return SDValue();
2331  case 2:
2332  Opcode = NVPTXISD::StoreV2;
2333  break;
2334  case 4:
2335  Opcode = NVPTXISD::StoreV4;
2336  break;
2337  case 8:
2338  // v8f16 is a special case. PTX doesn't have st.v8.f16
2339  // instruction. Instead, we split the vector into v2f16 chunks and
2340  // store them with st.v4.b32.
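      // Roughly (register names are illustrative), a v8f16 store becomes
      //   st.v4.b32 [addr], {%r0, %r1, %r2, %r3};
      // where each 32-bit register holds one packed f16x2 pair.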
2341  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2342  Opcode = NVPTXISD::StoreV4;
2343  StoreF16x2 = true;
2344  break;
2345  }
2346 
2346 
2347  SmallVector<SDValue, 8> Ops;
2348 
2349  // First is the chain
2350  Ops.push_back(N->getOperand(0));
2351 
2352  if (StoreF16x2) {
2353  // Combine f16,f16 -> v2f16
2354  NumElts /= 2;
2355  for (unsigned i = 0; i < NumElts; ++i) {
2356  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2357  DAG.getIntPtrConstant(i * 2, DL));
2358  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2359  DAG.getIntPtrConstant(i * 2 + 1, DL));
2360  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2361  Ops.push_back(V2);
2362  }
2363  } else {
2364  // Then the split values
2365  for (unsigned i = 0; i < NumElts; ++i) {
2366  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2367  DAG.getIntPtrConstant(i, DL));
2368  if (NeedExt)
2369  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2370  Ops.push_back(ExtVal);
2371  }
2372  }
2373 
2374  // Then any remaining arguments
2375  Ops.append(N->op_begin() + 2, N->op_end());
2376 
2377  SDValue NewSt =
2378  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2379  MemSD->getMemoryVT(), MemSD->getMemOperand());
2380 
2381  // return DCI.CombineTo(N, NewSt, true);
2382  return NewSt;
2383  }
2384 
2385  return SDValue();
2386 }
2387 
2388 // st i1 v, addr
2389 // =>
2390 // v1 = zxt v to i16
2391 // st.u8 i16, addr
2392 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2393  SDNode *Node = Op.getNode();
2394  SDLoc dl(Node);
2395  StoreSDNode *ST = cast<StoreSDNode>(Node);
2396  SDValue Tmp1 = ST->getChain();
2397  SDValue Tmp2 = ST->getBasePtr();
2398  SDValue Tmp3 = ST->getValue();
2399  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2400  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2401  SDValue Result =
2402  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2403  ST->getAlignment(), ST->getMemOperand()->getFlags());
2404  return Result;
2405 }
2406 
2407 SDValue
2408 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2409  std::string ParamSym;
2410  raw_string_ostream ParamStr(ParamSym);
2411 
2412  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2413  ParamStr.flush();
2414 
2415  std::string *SavedStr =
2416  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2417  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2418 }
2419 
2420 // Check to see if the kernel argument is image*_t or sampler_t
2421 
2422 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2423  static const char *const specialTypes[] = { "struct._image2d_t",
2424  "struct._image3d_t",
2425  "struct._sampler_t" };
2426 
2427  Type *Ty = arg->getType();
2428  auto *PTy = dyn_cast<PointerType>(Ty);
2429 
2430  if (!PTy)
2431  return false;
2432 
2433  if (!context)
2434  return false;
2435 
2436  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2437  if (!STy || STy->isLiteral())
2438  return false;
2439 
2440  return llvm::is_contained(specialTypes, STy->getName());
2441 }
2442 
2443 SDValue NVPTXTargetLowering::LowerFormalArguments(
2444  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2445  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2446  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2447  MachineFunction &MF = DAG.getMachineFunction();
2448  const DataLayout &DL = DAG.getDataLayout();
2449  auto PtrVT = getPointerTy(DAG.getDataLayout());
2450 
2451  const Function *F = &MF.getFunction();
2452  const AttributeList &PAL = F->getAttributes();
2453  const TargetLowering *TLI = STI.getTargetLowering();
2454 
2455  SDValue Root = DAG.getRoot();
2456  std::vector<SDValue> OutChains;
2457 
2458  bool isABI = (STI.getSmVersion() >= 20);
2459  assert(isABI && "Non-ABI compilation is not supported");
2460  if (!isABI)
2461  return Chain;
2462 
2463  std::vector<Type *> argTypes;
2464  std::vector<const Argument *> theArgs;
2465  for (const Argument &I : F->args()) {
2466  theArgs.push_back(&I);
2467  argTypes.push_back(I.getType());
2468  }
2469  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2470  // Ins.size() will be larger
2471  // * if there is an aggregate argument with multiple fields (each field
2472  // showing up separately in Ins)
2473  // * if there is a vector argument with more than typical vector-length
2474  // elements (generally if more than 4) where each vector element is
2475  // individually present in Ins.
2476  // So a different index should be used for indexing into Ins.
2477  // See similar issue in LowerCall.
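  // For example (illustrative), a parameter of type {float, float} contributes
  // one entry to argTypes but two entries to Ins, so InsIdx below advances
  // independently of i.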
2478  unsigned InsIdx = 0;
2479 
2480  int idx = 0;
2481  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2482  Type *Ty = argTypes[i];
2483 
2484  // If the kernel argument is image*_t or sampler_t, convert it to
2485  // an i32 constant holding the parameter position. This can later be
2486  // matched in the AsmPrinter to output the correct mangled name.
2487  if (isImageOrSamplerVal(
2488  theArgs[i],
2489  (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2490  : nullptr))) {
2492  "Only kernels can have image/sampler params");
2493  InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2494  continue;
2495  }
2496 
2497  if (theArgs[i]->use_empty()) {
2498  // argument is dead
2499  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2500  SmallVector<EVT, 16> vtparts;
2501 
2502  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2503  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2504  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2505  ++parti) {
2506  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2507  ++InsIdx;
2508  }
2509  if (vtparts.size() > 0)
2510  --InsIdx;
2511  continue;
2512  }
2513  if (Ty->isVectorTy()) {
2514  EVT ObjectVT = getValueType(DL, Ty);
2515  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2516  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2517  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2518  ++InsIdx;
2519  }
2520  if (NumRegs > 0)
2521  --InsIdx;
2522  continue;
2523  }
2524  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2525  continue;
2526  }
2527 
2528  // In the following cases, assign a node order of "idx+1"
2529  // to newly created nodes. The SDNodes for params have to
2530  // appear in the same order as their order of appearance
2531  // in the original function. "idx+1" holds that order.
2532  if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2533  bool aggregateIsPacked = false;
2534  if (StructType *STy = dyn_cast<StructType>(Ty))
2535  aggregateIsPacked = STy->isPacked();
2536 
2537  SmallVector<EVT, 16> VTs;
2538  SmallVector<uint64_t, 16> Offsets;
2539  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2540  assert(VTs.size() > 0 && "Unexpected empty type.");
2541  auto VectorInfo =
2542  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
2543 
2544  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2545  int VecIdx = -1; // Index of the first element of the current vector.
2546  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2547  if (VectorInfo[parti] & PVF_FIRST) {
2548  assert(VecIdx == -1 && "Orphaned vector.");
2549  VecIdx = parti;
2550  }
2551 
2552  // That's the last element of this store op.
2553  if (VectorInfo[parti] & PVF_LAST) {
2554  unsigned NumElts = parti - VecIdx + 1;
2555  EVT EltVT = VTs[parti];
2556  // i1 is loaded/stored as i8.
2557  EVT LoadVT = EltVT;
2558  if (EltVT == MVT::i1)
2559  LoadVT = MVT::i8;
2560  else if (EltVT == MVT::v2f16)
2561  // getLoad needs a vector type, but it can't handle
2562  // vectors which contain v2f16 elements. So we must load
2563  // using i32 here and then bitcast back.
2564  LoadVT = MVT::i32;
2565 
2566  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2567  SDValue VecAddr =
2568  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2569  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2570  Value *srcValue = Constant::getNullValue(PointerType::get(
2571  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2572  SDValue P =
2573  DAG.getLoad(VecVT, dl, Root, VecAddr,
2574  MachinePointerInfo(srcValue), aggregateIsPacked,
2575  MachineMemOperand::MODereferenceable |
2576  MachineMemOperand::MOInvariant);
2577  if (P.getNode())
2578  P.getNode()->setIROrder(idx + 1);
2579  for (unsigned j = 0; j < NumElts; ++j) {
2580  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2581  DAG.getIntPtrConstant(j, dl));
2582  // We've loaded i1 as an i8 and now must truncate it back to i1
2583  if (EltVT == MVT::i1)
2584  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2585  // v2f16 was loaded as an i32. Now we must bitcast it back.
2586  else if (EltVT == MVT::v2f16)
2587  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2588  // Extend the element if necessary (e.g. an i8 is loaded
2589  // into an i16 register)
2590  if (Ins[InsIdx].VT.isInteger() &&
2591  Ins[InsIdx].VT.getFixedSizeInBits() >
2592  LoadVT.getFixedSizeInBits()) {
2593  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2594  : ISD::ZERO_EXTEND;
2595  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2596  }
2597  InVals.push_back(Elt);
2598  }
2599 
2600  // Reset vector tracking state.
2601  VecIdx = -1;
2602  }
2603  ++InsIdx;
2604  }
2605  if (VTs.size() > 0)
2606  --InsIdx;
2607  continue;
2608  }
2609 
2610  // Param has ByVal attribute
2611  // Return MoveParam(param symbol).
2612  // Ideally, the param symbol could be returned directly,
2613  // but when the SDNode builder decides to use it in a CopyToReg(),
2614  // the machine instruction fails because the TargetExternalSymbol
2615  // (not lowered) is target dependent, and CopyToReg assumes
2616  // the source is lowered.
2617  EVT ObjectVT = getValueType(DL, Ty);
2618  assert(ObjectVT == Ins[InsIdx].VT &&
2619  "Ins type did not match function type");
2620  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2621  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2622  if (p.getNode())
2623  p.getNode()->setIROrder(idx + 1);
2624  InVals.push_back(p);
2625  }
2626 
2627  // Clang will check explicit VarArg and issue an error if any is found.
2628  // However, Clang will let code with an implicit vararg declaration like
2629  // f() pass. See bug 617733.
2630  // We treat this case as if the arg list is empty.
2631  // if (F.isVarArg()) {
2632  // assert(0 && "VarArg not supported yet!");
2633  //}
2634 
2635  if (!OutChains.empty())
2636  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2637 
2638  return Chain;
2639 }
2640 
2641 SDValue
2642 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2643  bool isVarArg,
2644  const SmallVectorImpl<ISD::OutputArg> &Outs,
2645  const SmallVectorImpl<SDValue> &OutVals,
2646  const SDLoc &dl, SelectionDAG &DAG) const {
2647  MachineFunction &MF = DAG.getMachineFunction();
2648  Type *RetTy = MF.getFunction().getReturnType();
2649 
2650  bool isABI = (STI.getSmVersion() >= 20);
2651  assert(isABI && "Non-ABI compilation is not supported");
2652  if (!isABI)
2653  return Chain;
2654 
2655  const DataLayout DL = DAG.getDataLayout();
2656  SmallVector<EVT, 16> VTs;
2657  SmallVector<uint64_t, 16> Offsets;
2658  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2659  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2660 
2661  auto VectorInfo = VectorizePTXValueVTs(
2662  VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));
2663 
2664  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2665  // 32-bits are sign extended or zero extended, depending on whether
2666  // they are signed or unsigned types.
2667  bool ExtendIntegerRetVal =
2668  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2669 
2670  SmallVector<SDValue, 6> StoreOperands;
2671  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2672  // New load/store. Record chain and offset operands.
2673  if (VectorInfo[i] & PVF_FIRST) {
2674  assert(StoreOperands.empty() && "Orphaned operand list.");
2675  StoreOperands.push_back(Chain);
2676  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2677  }
2678 
2679  SDValue RetVal = OutVals[i];
2680  if (ExtendIntegerRetVal) {
2681  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2682  : ISD::ZERO_EXTEND,
2683  dl, MVT::i32, RetVal);
2684  } else if (RetVal.getValueSizeInBits() < 16) {
2685  // Use 16-bit registers for small load-stores as it's the
2686  // smallest general purpose register size supported by NVPTX.
2687  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2688  }
2689 
2690  // Record the value to return.
2691  StoreOperands.push_back(RetVal);
2692 
2693  // That's the last element of this store op.
2694  if (VectorInfo[i] & PVF_LAST) {
2695  NVPTXISD::NodeType Op;
2696  unsigned NumElts = StoreOperands.size() - 2;
2697  switch (NumElts) {
2698  case 1:
2699  Op = NVPTXISD::StoreRetval;
2700  break;
2701  case 2:
2702  Op = NVPTXISD::StoreRetvalV2;
2703  break;
2704  case 4:
2705  Op = NVPTXISD::StoreRetvalV4;
2706  break;
2707  default:
2708  llvm_unreachable("Invalid vector info.");
2709  }
2710 
2711  // Adjust type of load/store op if we've extended the scalar
2712  // return value.
2713  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2714  Chain = DAG.getMemIntrinsicNode(
2715  Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
2716  MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
2717  // Cleanup vector state.
2718  StoreOperands.clear();
2719  }
2720  }
2721 
2722  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2723 }
2724 
2725 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2726  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2727  SelectionDAG &DAG) const {
2728  if (Constraint.length() > 1)
2729  return;
2730  else
2731  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2732 }
2733 
2734 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2735  switch (Intrinsic) {
2736  default:
2737  return 0;
2738 
2739  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2740  return NVPTXISD::Tex1DFloatS32;
2741  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2743  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2745  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2747  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2748  return NVPTXISD::Tex1DS32S32;
2749  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2750  return NVPTXISD::Tex1DS32Float;
2751  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2753  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2755  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2756  return NVPTXISD::Tex1DU32S32;
2757  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2758  return NVPTXISD::Tex1DU32Float;
2759  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2761  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2763 
2764  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2766  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2768  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2770  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2772  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2774  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2776  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2778  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2780  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2782  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2784  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2786  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2788 
2789  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2790  return NVPTXISD::Tex2DFloatS32;
2791  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2793  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2795  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2797  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2798  return NVPTXISD::Tex2DS32S32;
2799  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2800  return NVPTXISD::Tex2DS32Float;
2801  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2803  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2805  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2806  return NVPTXISD::Tex2DU32S32;
2807  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2808  return NVPTXISD::Tex2DU32Float;
2809  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2811  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2813 
2814  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2816  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2818  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2820  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2822  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2824  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2826  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2828  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2830  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2832  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2834  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2836  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2838 
2839  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2840  return NVPTXISD::Tex3DFloatS32;
2841  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2843  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2845  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2847  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2848  return NVPTXISD::Tex3DS32S32;
2849  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2850  return NVPTXISD::Tex3DS32Float;
2851  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2853  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2855  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2856  return NVPTXISD::Tex3DU32S32;
2857  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2858  return NVPTXISD::Tex3DU32Float;
2859  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2861  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2863 
2864  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2866  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2868  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2870  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2872  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2874  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2876 
2877  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2879  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2881  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2883  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2885  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2887  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2889 
2890  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2892  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2894  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2896  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2898  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2900  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2902  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2904  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2906  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2908  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2910  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2912  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2914 
2915  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2917  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2919  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2921  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2923  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2925  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2927  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2929  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2931  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2933  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2935  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2937  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2939 
2940  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2942  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2944  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2946  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2948  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2950  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2952  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2954  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2956  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2958  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2960  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2962  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2964 
2965  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2967  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2969  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2971  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2973  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2975  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2977  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2979  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2981  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2983  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2985  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2987  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2989 
2990  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2992  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2994  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2996  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2998  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3000  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3002  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3004  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3006  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3008  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3010  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3012  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3014 
3015  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3017  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3019  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3021  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3023  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3025  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3027  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3029  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3031  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3033  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3035  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3037  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3039 
3040  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3042  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3044  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3046  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3048  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3050  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3052 
3053  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3055  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3057  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3059  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3061  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3063  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3065 
3066  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3068  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3070  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3072  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3074  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3076  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3078  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3080  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3082  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3084  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3086  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3088  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3090  }
3091 }
3092 
3093 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3094  switch (Intrinsic) {
3095  default:
3096  return 0;
3097  case Intrinsic::nvvm_suld_1d_i8_clamp:
3098  return NVPTXISD::Suld1DI8Clamp;
3099  case Intrinsic::nvvm_suld_1d_i16_clamp:
3100  return NVPTXISD::Suld1DI16Clamp;
3101  case Intrinsic::nvvm_suld_1d_i32_clamp:
3102  return NVPTXISD::Suld1DI32Clamp;
3103  case Intrinsic::nvvm_suld_1d_i64_clamp:
3104  return NVPTXISD::Suld1DI64Clamp;
3105  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3107  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3109  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3111  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3113  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3115  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3117  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3119  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3121  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3123  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3125  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3127  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3129  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3131  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3133  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3135  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3137  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3139  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3141  case Intrinsic::nvvm_suld_2d_i8_clamp:
3142  return NVPTXISD::Suld2DI8Clamp;
3143  case Intrinsic::nvvm_suld_2d_i16_clamp:
3144  return NVPTXISD::Suld2DI16Clamp;
3145  case Intrinsic::nvvm_suld_2d_i32_clamp:
3146  return NVPTXISD::Suld2DI32Clamp;
3147  case Intrinsic::nvvm_suld_2d_i64_clamp:
3148  return NVPTXISD::Suld2DI64Clamp;
3149  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3151  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3153  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3155  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3157  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3159  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3161  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3163  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3165  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3167  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3169  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3171  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3173  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3175  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3177  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3179  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3181  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3183  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3185  case Intrinsic::nvvm_suld_3d_i8_clamp:
3186  return NVPTXISD::Suld3DI8Clamp;
3187  case Intrinsic::nvvm_suld_3d_i16_clamp:
3188  return NVPTXISD::Suld3DI16Clamp;
3189  case Intrinsic::nvvm_suld_3d_i32_clamp:
3190  return NVPTXISD::Suld3DI32Clamp;
3191  case Intrinsic::nvvm_suld_3d_i64_clamp:
3192  return NVPTXISD::Suld3DI64Clamp;
3193  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3195  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3197  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3199  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3201  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3203  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3205  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3207  case Intrinsic::nvvm_suld_1d_i8_trap:
3208  return NVPTXISD::Suld1DI8Trap;
3209  case Intrinsic::nvvm_suld_1d_i16_trap:
3210  return NVPTXISD::Suld1DI16Trap;
3211  case Intrinsic::nvvm_suld_1d_i32_trap:
3212  return NVPTXISD::Suld1DI32Trap;
3213  case Intrinsic::nvvm_suld_1d_i64_trap:
3214  return NVPTXISD::Suld1DI64Trap;
3215  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3216  return NVPTXISD::Suld1DV2I8Trap;
3217  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3219  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3221  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3223  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3224  return NVPTXISD::Suld1DV4I8Trap;
3225  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3227  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3229  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3231  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3233  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3235  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3237  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3239  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3241  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3243  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3245  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3247  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3249  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3251  case Intrinsic::nvvm_suld_2d_i8_trap:
3252  return NVPTXISD::Suld2DI8Trap;
3253  case Intrinsic::nvvm_suld_2d_i16_trap:
3254  return NVPTXISD::Suld2DI16Trap;
3255  case Intrinsic::nvvm_suld_2d_i32_trap:
3256  return NVPTXISD::Suld2DI32Trap;
3257  case Intrinsic::nvvm_suld_2d_i64_trap:
3258  return NVPTXISD::Suld2DI64Trap;
3259  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3260  return NVPTXISD::Suld2DV2I8Trap;
3261  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3263  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3265  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3267  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3268  return NVPTXISD::Suld2DV4I8Trap;
3269  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3271  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3273  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3275  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3277  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3279  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3281  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3283  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3285  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3287  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3289  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3291  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3293  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3295  case Intrinsic::nvvm_suld_3d_i8_trap:
3296  return NVPTXISD::Suld3DI8Trap;
3297  case Intrinsic::nvvm_suld_3d_i16_trap:
3298  return NVPTXISD::Suld3DI16Trap;
3299  case Intrinsic::nvvm_suld_3d_i32_trap:
3300  return NVPTXISD::Suld3DI32Trap;
3301  case Intrinsic::nvvm_suld_3d_i64_trap:
3302  return NVPTXISD::Suld3DI64Trap;
3303  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3304  return NVPTXISD::Suld3DV2I8Trap;
3305  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3307  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3309  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3311  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3312  return NVPTXISD::Suld3DV4I8Trap;
3313  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3315  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3317  case Intrinsic::nvvm_suld_1d_i8_zero:
3318  return NVPTXISD::Suld1DI8Zero;
3319  case Intrinsic::nvvm_suld_1d_i16_zero:
3320  return NVPTXISD::Suld1DI16Zero;
3321  case Intrinsic::nvvm_suld_1d_i32_zero:
3322  return NVPTXISD::Suld1DI32Zero;
3323  case Intrinsic::nvvm_suld_1d_i64_zero:
3324  return NVPTXISD::Suld1DI64Zero;
3325  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3326  return NVPTXISD::Suld1DV2I8Zero;
3327  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3329  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3331  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3333  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3334  return NVPTXISD::Suld1DV4I8Zero;
3335  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3337  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3339  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3341  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3343  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3345  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3347  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3349  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3351  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3353  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3355  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3357  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3359  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3361  case Intrinsic::nvvm_suld_2d_i8_zero:
3362  return NVPTXISD::Suld2DI8Zero;
3363  case Intrinsic::nvvm_suld_2d_i16_zero:
3364  return NVPTXISD::Suld2DI16Zero;
3365  case Intrinsic::nvvm_suld_2d_i32_zero:
3366  return NVPTXISD::Suld2DI32Zero;
3367  case Intrinsic::nvvm_suld_2d_i64_zero:
3368  return NVPTXISD::Suld2DI64Zero;
3369  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3370  return NVPTXISD::Suld2DV2I8Zero;
3371  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3372  return NVPTXISD::Suld2DV2I16Zero;
3373  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3374  return NVPTXISD::Suld2DV2I32Zero;
3375  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3376  return NVPTXISD::Suld2DV2I64Zero;
3377  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3378  return NVPTXISD::Suld2DV4I8Zero;
3379  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3380  return NVPTXISD::Suld2DV4I16Zero;
3381  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3382  return NVPTXISD::Suld2DV4I32Zero;
3383  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3384  return NVPTXISD::Suld2DArrayI8Zero;
3385  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3386  return NVPTXISD::Suld2DArrayI16Zero;
3387  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3388  return NVPTXISD::Suld2DArrayI32Zero;
3389  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3390  return NVPTXISD::Suld2DArrayI64Zero;
3391  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3392  return NVPTXISD::Suld2DArrayV2I8Zero;
3393  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3394  return NVPTXISD::Suld2DArrayV2I16Zero;
3395  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3396  return NVPTXISD::Suld2DArrayV2I32Zero;
3397  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3398  return NVPTXISD::Suld2DArrayV2I64Zero;
3399  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3400  return NVPTXISD::Suld2DArrayV4I8Zero;
3401  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3402  return NVPTXISD::Suld2DArrayV4I16Zero;
3403  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3404  return NVPTXISD::Suld2DArrayV4I32Zero;
3405  case Intrinsic::nvvm_suld_3d_i8_zero:
3406  return NVPTXISD::Suld3DI8Zero;
3407  case Intrinsic::nvvm_suld_3d_i16_zero:
3408  return NVPTXISD::Suld3DI16Zero;
3409  case Intrinsic::nvvm_suld_3d_i32_zero:
3410  return NVPTXISD::Suld3DI32Zero;
3411  case Intrinsic::nvvm_suld_3d_i64_zero:
3412  return NVPTXISD::Suld3DI64Zero;
3413  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3414  return NVPTXISD::Suld3DV2I8Zero;
3415  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3416  return NVPTXISD::Suld3DV2I16Zero;
3417  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3418  return NVPTXISD::Suld3DV2I32Zero;
3419  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3420  return NVPTXISD::Suld3DV2I64Zero;
3421  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3422  return NVPTXISD::Suld3DV4I8Zero;
3423  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3424  return NVPTXISD::Suld3DV4I16Zero;
3425  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3426  return NVPTXISD::Suld3DV4I32Zero;
3427  }
3428 }
3429 
3430 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3431 // TgtMemIntrinsic because we need the information that is only available
3432 // in the "Value" type of the destination pointer. In particular, the
3433 // address space information.
3434 
3435 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3436  IntrinsicInfo &Info, const CallInst &I,
3437  MachineFunction &MF, unsigned Intrinsic) const {
3438  switch (Intrinsic) {
3439  default:
3440  return false;
3441  case Intrinsic::nvvm_match_all_sync_i32p:
3442  case Intrinsic::nvvm_match_all_sync_i64p:
3443  Info.opc = ISD::INTRINSIC_W_CHAIN;
3444  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3445  // in order to model data exchange with other threads, but perform no real
3446  // memory accesses.
3447  Info.memVT = MVT::i1;
3448 
3449  // Our result depends on both our and other thread's arguments.
3450  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3451  return true;
3452  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3453  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3454  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3455  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3456  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3457  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3458  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3459  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3460  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3461  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3462  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3463  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3464  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3465  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3466  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3467  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3468  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3469  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3470  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3471  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3472  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3473  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3474  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3475  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3476  Info.opc = ISD::INTRINSIC_W_CHAIN;
3477  Info.memVT = MVT::v8f16;
3478  Info.ptrVal = I.getArgOperand(0);
3479  Info.offset = 0;
3480  Info.flags = MachineMemOperand::MOLoad;
3481  Info.align = Align(16);
3482  return true;
3483  }
3484  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3485  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3486  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3487  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3488  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3489  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3490  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3491  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3492  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3493  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3494  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3495  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3496  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3497  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3498  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3499  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
3500  Info.opc = ISD::INTRINSIC_W_CHAIN;
3501  Info.memVT = MVT::v2i32;
3502  Info.ptrVal = I.getArgOperand(0);
3503  Info.offset = 0;
3504  Info.flags = MachineMemOperand::MOLoad;
3505  Info.align = Align(8);
3506  return true;
3507  }
3508 
3509  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3510  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3511  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3512  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3513  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3514  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3515  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3516  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3517 
3518  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3519  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3520  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3521  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3522  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3523  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3524  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3525  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
3526  Info.opc = ISD::INTRINSIC_W_CHAIN;
3527  Info.memVT = MVT::v4i32;
3528  Info.ptrVal = I.getArgOperand(0);
3529  Info.offset = 0;
3530  Info.flags = MachineMemOperand::MOLoad;
3531  Info.align = Align(16);
3532  return true;
3533  }
3534 
3535  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3536  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3537  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3538  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3539  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3540  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3541  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3542  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3543 
3544  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3545  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3546  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3547  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3548  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3549  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3550  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3551  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3552  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3553  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3554  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3555  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3556  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3557  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3558  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3559  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3560  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3561  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3562  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3563  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
3564  Info.opc = ISD::INTRINSIC_W_CHAIN;
3565  Info.memVT = MVT::i32;
3566  Info.ptrVal = I.getArgOperand(0);
3567  Info.offset = 0;
3568  Info.flags = MachineMemOperand::MOLoad;
3569  Info.align = Align(4);
3570  return true;
3571  }
3572 
3573  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3574  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3575  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3576  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3577  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3578  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3579  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3580  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3581  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3582  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3583  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3584  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3585  Info.opc = ISD::INTRINSIC_W_CHAIN;
3586  Info.memVT = MVT::v4f16;
3587  Info.ptrVal = I.getArgOperand(0);
3588  Info.offset = 0;
3589  Info.flags = MachineMemOperand::MOLoad;
3590  Info.align = Align(16);
3591  return true;
3592  }
3593 
3594  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3595  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3596  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3597  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3598  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3599  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3600  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3601  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3602  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3603  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3604  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3605  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3606  Info.opc = ISD::INTRINSIC_W_CHAIN;
3607  Info.memVT = MVT::v8f32;
3608  Info.ptrVal = I.getArgOperand(0);
3609  Info.offset = 0;
3610  Info.flags = MachineMemOperand::MOLoad;
3611  Info.align = Align(16);
3612  return true;
3613  }
3614 
3615  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3616  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3617  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3618  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3619  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3620  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3621  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3622  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3623  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3624  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3625  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3626  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3627  Info.opc = ISD::INTRINSIC_W_CHAIN;
3628  Info.memVT = MVT::v8i32;
3629  Info.ptrVal = I.getArgOperand(0);
3630  Info.offset = 0;
3631  Info.flags = MachineMemOperand::MOLoad;
3632  Info.align = Align(16);
3633  return true;
3634  }
3635 
3636  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3637  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3638  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3639  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3640  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3641  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3642  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3643  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
3644  Info.opc = ISD::INTRINSIC_W_CHAIN;
3645  Info.memVT = MVT::v2i32;
3646  Info.ptrVal = I.getArgOperand(0);
3647  Info.offset = 0;
3648  Info.flags = MachineMemOperand::MOLoad;
3649  Info.align = Align(8);
3650  return true;
3651  }
3652 
3653  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3654  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3655  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3656  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3657  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3658  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3659  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3660  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3661  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3662  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3663  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3664  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3665  Info.opc = ISD::INTRINSIC_VOID;
3666  Info.memVT = MVT::v4f16;
3667  Info.ptrVal = I.getArgOperand(0);
3668  Info.offset = 0;
3669  Info.flags = MachineMemOperand::MOStore;
3670  Info.align = Align(16);
3671  return true;
3672  }
3673 
3674  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3675  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3676  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3677  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3678  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3679  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3680  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3681  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3682  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3683  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3684  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3685  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3686  Info.opc = ISD::INTRINSIC_VOID;
3687  Info.memVT = MVT::v8f32;
3688  Info.ptrVal = I.getArgOperand(0);
3689  Info.offset = 0;
3690  Info.flags = MachineMemOperand::MOStore;
3691  Info.align = Align(16);
3692  return true;
3693  }
3694 
3695  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3696  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3697  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3698  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3699  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3700  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3701  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3702  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3703  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3704  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3705  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3706  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3707  Info.opc = ISD::INTRINSIC_VOID;
3708  Info.memVT = MVT::v8i32;
3709  Info.ptrVal = I.getArgOperand(0);
3710  Info.offset = 0;
3711  Info.flags = MachineMemOperand::MOStore;
3712  Info.align = Align(16);
3713  return true;
3714  }
3715 
3716  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3717  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3718  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3719  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3720  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3721  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3722  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3723  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3724  Info.opc = ISD::INTRINSIC_VOID;
3725  Info.memVT = MVT::v2i32;
3726  Info.ptrVal = I.getArgOperand(0);
3727  Info.offset = 0;
3728  Info.flags = MachineMemOperand::MOStore;
3729  Info.align = Align(8);
3730  return true;
3731  }
3732 
3733  case Intrinsic::nvvm_atomic_load_inc_32:
3734  case Intrinsic::nvvm_atomic_load_dec_32:
3735 
3736  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3737  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3738  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3739  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3740  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3741  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3742  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3743  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3744  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3745  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3746  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3747  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3748  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3749  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3750  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3751  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3752  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3753  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3754  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3755  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3756  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3757  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3758  auto &DL = I.getModule()->getDataLayout();
3759  Info.opc = ISD::INTRINSIC_W_CHAIN;
3760  Info.memVT = getValueType(DL, I.getType());
3761  Info.ptrVal = I.getArgOperand(0);
3762  Info.offset = 0;
3763  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3764  Info.align.reset();
3765  return true;
3766  }
3767 
3768  case Intrinsic::nvvm_ldu_global_i:
3769  case Intrinsic::nvvm_ldu_global_f:
3770  case Intrinsic::nvvm_ldu_global_p: {
3771  auto &DL = I.getModule()->getDataLayout();
3772  Info.opc = ISD::INTRINSIC_W_CHAIN;
3773  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3774  Info.memVT = getValueType(DL, I.getType());
3775  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3776  Info.memVT = getPointerTy(DL);
3777  else
3778  Info.memVT = getValueType(DL, I.getType());
3779  Info.ptrVal = I.getArgOperand(0);
3780  Info.offset = 0;
3781  Info.flags = MachineMemOperand::MOLoad;
3782  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3783 
3784  return true;
3785  }
3786  case Intrinsic::nvvm_ldg_global_i:
3787  case Intrinsic::nvvm_ldg_global_f:
3788  case Intrinsic::nvvm_ldg_global_p: {
3789  auto &DL = I.getModule()->getDataLayout();
3790 
3791  Info.opc = ISD::INTRINSIC_W_CHAIN;
3792  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3793  Info.memVT = getValueType(DL, I.getType());
3794  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3795  Info.memVT = getPointerTy(DL);
3796  else
3797  Info.memVT = getValueType(DL, I.getType());
3798  Info.ptrVal = I.getArgOperand(0);
3799  Info.offset = 0;
3800  Info.flags = MachineMemOperand::MOLoad;
3801  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3802 
3803  return true;
3804  }
3805 
3806  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3807  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3808  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3809  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3810  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3811  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3812  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3813  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3814  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3815  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3816  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3817  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3818  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3819  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3820  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3821  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3822  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3823  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3824  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3825  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3826  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3827  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3828  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3829  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3830  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3831  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3832  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3833  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3834  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3835  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3836  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3837  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3838  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3839  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3840  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3841  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3842  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3843  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3844  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3845  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3846  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3847  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3848  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3849  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3850  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3851  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3852  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3853  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3854  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3855  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3856  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3857  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3858  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3859  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3860  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3861  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3862  Info.opc = getOpcForTextureInstr(Intrinsic);
3863  Info.memVT = MVT::v4f32;
3864  Info.ptrVal = nullptr;
3865  Info.offset = 0;
3866  Info.flags = MachineMemOperand::MOLoad;
3867  Info.align = Align(16);
3868  return true;
3869 
3870  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3871  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3872  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3873  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3874  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3875  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3876  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3877  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3878  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3879  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3880  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3881  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3882  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3883  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3884  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3885  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3886  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3887  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3888  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3889  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3890  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3891  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3892  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3893  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3894  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3895  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3896  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3897  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3898  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3899  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3900  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3901  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3902  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3903  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3904  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3905  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3906  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3907  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3908  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3909  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3910  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3911  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3912  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3913  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3914  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3915  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3916  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3917  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3918  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3919  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3920  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3921  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3922  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3923  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3924  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3925  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3926  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3927  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3928  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3929  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3930  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3931  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3932  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3933  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3934  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3935  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3936  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3937  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3938  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3939  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3940  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3941  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3942  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3943  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3944  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3945  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3946  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3947  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3948  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3949  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3950  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3951  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3952  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3953  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3954  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3955  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3956  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3957  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3958  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3959  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3960  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3961  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3962  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3963  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3964  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3965  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3966  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3967  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3968  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3969  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3970  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3971  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3972  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3973  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3974  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3975  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3976  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3977  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3978  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3979  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3980  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3981  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3982  Info.opc = getOpcForTextureInstr(Intrinsic);
3983  Info.memVT = MVT::v4i32;
3984  Info.ptrVal = nullptr;
3985  Info.offset = 0;
3986  Info.flags = MachineMemOperand::MOLoad;
3987  Info.align = Align(16);
3988  return true;
3989 
3990  case Intrinsic::nvvm_suld_1d_i8_clamp:
3991  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3992  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3993  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3994  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3995  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3996  case Intrinsic::nvvm_suld_2d_i8_clamp:
3997  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3998  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3999  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4000  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4001  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4002  case Intrinsic::nvvm_suld_3d_i8_clamp:
4003  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4004  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4005  case Intrinsic::nvvm_suld_1d_i8_trap:
4006  case Intrinsic::nvvm_suld_1d_v2i8_trap:
4007  case Intrinsic::nvvm_suld_1d_v4i8_trap:
4008  case Intrinsic::nvvm_suld_1d_array_i8_trap:
4009  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4010  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4011  case Intrinsic::nvvm_suld_2d_i8_trap:
4012  case Intrinsic::nvvm_suld_2d_v2i8_trap:
4013  case Intrinsic::nvvm_suld_2d_v4i8_trap:
4014  case Intrinsic::nvvm_suld_2d_array_i8_trap:
4015  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4016  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4017  case Intrinsic::nvvm_suld_3d_i8_trap:
4018  case Intrinsic::nvvm_suld_3d_v2i8_trap:
4019  case Intrinsic::nvvm_suld_3d_v4i8_trap:
4020  case Intrinsic::nvvm_suld_1d_i8_zero:
4021  case Intrinsic::nvvm_suld_1d_v2i8_zero:
4022  case Intrinsic::nvvm_suld_1d_v4i8_zero:
4023  case Intrinsic::nvvm_suld_1d_array_i8_zero:
4024  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4025  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4026  case Intrinsic::nvvm_suld_2d_i8_zero:
4027  case Intrinsic::nvvm_suld_2d_v2i8_zero:
4028  case Intrinsic::nvvm_suld_2d_v4i8_zero:
4029  case Intrinsic::nvvm_suld_2d_array_i8_zero:
4030  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4031  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4032  case Intrinsic::nvvm_suld_3d_i8_zero:
4033  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4034  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4035  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4036  Info.memVT = MVT::i8;
4037  Info.ptrVal = nullptr;
4038  Info.offset = 0;
4039  Info.flags = MachineMemOperand::MOLoad;
4040  Info.align = Align(16);
4041  return true;
4042 
4043  case Intrinsic::nvvm_suld_1d_i16_clamp:
4044  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4045  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4046  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4047  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4048  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4049  case Intrinsic::nvvm_suld_2d_i16_clamp:
4050  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4051  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4052  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4053  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4054  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4055  case Intrinsic::nvvm_suld_3d_i16_clamp:
4056  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4057  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4058  case Intrinsic::nvvm_suld_1d_i16_trap:
4059  case Intrinsic::nvvm_suld_1d_v2i16_trap:
4060  case Intrinsic::nvvm_suld_1d_v4i16_trap:
4061  case Intrinsic::nvvm_suld_1d_array_i16_trap:
4062  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4063  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4064  case Intrinsic::nvvm_suld_2d_i16_trap:
4065  case Intrinsic::nvvm_suld_2d_v2i16_trap:
4066  case Intrinsic::nvvm_suld_2d_v4i16_trap:
4067  case Intrinsic::nvvm_suld_2d_array_i16_trap:
4068  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4069  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4070  case Intrinsic::nvvm_suld_3d_i16_trap:
4071  case Intrinsic::nvvm_suld_3d_v2i16_trap:
4072  case Intrinsic::nvvm_suld_3d_v4i16_trap:
4073  case Intrinsic::nvvm_suld_1d_i16_zero:
4074  case Intrinsic::nvvm_suld_1d_v2i16_zero:
4075  case Intrinsic::nvvm_suld_1d_v4i16_zero:
4076  case Intrinsic::nvvm_suld_1d_array_i16_zero:
4077  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4078  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4079  case Intrinsic::nvvm_suld_2d_i16_zero:
4080  case Intrinsic::nvvm_suld_2d_v2i16_zero:
4081  case Intrinsic::nvvm_suld_2d_v4i16_zero:
4082  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4083  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4084  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4085  case Intrinsic::nvvm_suld_3d_i16_zero:
4086  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4087  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4088  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4089  Info.memVT = MVT::i16;
4090  Info.ptrVal = nullptr;
4091  Info.offset = 0;
4092  Info.flags = MachineMemOperand::MOLoad;
4093  Info.align = Align(16);
4094  return true;
4095 
4096  case Intrinsic::nvvm_suld_1d_i32_clamp:
4097  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4098  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4099  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4100  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4101  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4102  case Intrinsic::nvvm_suld_2d_i32_clamp:
4103  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4104  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4105  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4106  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4107  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4108  case Intrinsic::nvvm_suld_3d_i32_clamp:
4109  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4110  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4111  case Intrinsic::nvvm_suld_1d_i32_trap:
4112  case Intrinsic::nvvm_suld_1d_v2i32_trap:
4113  case Intrinsic::nvvm_suld_1d_v4i32_trap:
4114  case Intrinsic::nvvm_suld_1d_array_i32_trap:
4115  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4116  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4117  case Intrinsic::nvvm_suld_2d_i32_trap:
4118  case Intrinsic::nvvm_suld_2d_v2i32_trap:
4119  case Intrinsic::nvvm_suld_2d_v4i32_trap:
4120  case Intrinsic::nvvm_suld_2d_array_i32_trap:
4121  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4122  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4123  case Intrinsic::nvvm_suld_3d_i32_trap:
4124  case Intrinsic::nvvm_suld_3d_v2i32_trap:
4125  case Intrinsic::nvvm_suld_3d_v4i32_trap:
4126  case Intrinsic::nvvm_suld_1d_i32_zero:
4127  case Intrinsic::nvvm_suld_1d_v2i32_zero:
4128  case Intrinsic::nvvm_suld_1d_v4i32_zero:
4129  case Intrinsic::nvvm_suld_1d_array_i32_zero:
4130  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4131  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4132  case Intrinsic::nvvm_suld_2d_i32_zero:
4133  case Intrinsic::nvvm_suld_2d_v2i32_zero:
4134  case Intrinsic::nvvm_suld_2d_v4i32_zero:
4135  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4136  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4137  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4138  case Intrinsic::nvvm_suld_3d_i32_zero:
4139  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4140  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4141  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4142  Info.memVT = MVT::i32;
4143  Info.ptrVal = nullptr;
4144  Info.offset = 0;
4145  Info.flags = MachineMemOperand::MOLoad;
4146  Info.align = Align(16);
4147  return true;
4148 
4149  case Intrinsic::nvvm_suld_1d_i64_clamp:
4150  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4151  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4152  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4153  case Intrinsic::nvvm_suld_2d_i64_clamp:
4154  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4155  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4156  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4157  case Intrinsic::nvvm_suld_3d_i64_clamp:
4158  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4159  case Intrinsic::nvvm_suld_1d_i64_trap:
4160  case Intrinsic::nvvm_suld_1d_v2i64_trap:
4161  case Intrinsic::nvvm_suld_1d_array_i64_trap:
4162  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4163  case Intrinsic::nvvm_suld_2d_i64_trap:
4164  case Intrinsic::nvvm_suld_2d_v2i64_trap:
4165  case Intrinsic::nvvm_suld_2d_array_i64_trap:
4166  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4167  case Intrinsic::nvvm_suld_3d_i64_trap:
4168  case Intrinsic::nvvm_suld_3d_v2i64_trap:
4169  case Intrinsic::nvvm_suld_1d_i64_zero:
4170  case Intrinsic::nvvm_suld_1d_v2i64_zero:
4171  case Intrinsic::nvvm_suld_1d_array_i64_zero:
4172  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4173  case Intrinsic::nvvm_suld_2d_i64_zero:
4174  case Intrinsic::nvvm_suld_2d_v2i64_zero:
4175  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4176  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4177  case Intrinsic::nvvm_suld_3d_i64_zero:
4178  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4179  Info.opc = getOpcForSurfaceInstr(Intrinsic);
4180  Info.memVT = MVT::i64;
4181  Info.ptrVal = nullptr;
4182  Info.offset = 0;
4183  Info.flags = MachineMemOperand::MOLoad;
4184  Info.align = Align(16);
4185  return true;
4186  }
4187  return false;
4188 }
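// Illustrative sketch of what the hook above produces, assuming a typical
// front-end call to one of the ldg intrinsics (the exact IR mangling shown
// here is an assumption, not taken from this file):
//
//   %v = call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %p, i32 4)
//
// For this call the nvvm_ldg_global_i case fills in roughly:
//
//   Info.opc    -> ISD::INTRINSIC_W_CHAIN   (value-producing, chained node)
//   Info.memVT  -> MVT::i32                 (type actually read from memory)
//   Info.ptrVal -> %p                       (lets alias analysis see the access)
//   Info.align  -> Align(4)                 (taken from the second argument)
//
// so the selected machine node carries a MachineMemOperand describing the load.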
4189 
4190 /// isLegalAddressingMode - Return true if the addressing mode represented
4191 /// by AM is legal for this target, for a load/store of the specified type.
4192 /// Used to guide target specific optimizations, like loop strength reduction
4193 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
4194 /// (CodeGenPrepare.cpp)
4195 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4196  const AddrMode &AM, Type *Ty,
4197  unsigned AS, Instruction *I) const {
4198  // AddrMode - This represents an addressing mode of:
4199  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4200  //
4201  // The legal address modes are
4202  // - [avar]
4203  // - [areg]
4204  // - [areg+immoff]
4205  // - [immAddr]
4206 
4207  if (AM.BaseGV) {
4208  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4209  }
4210 
4211  switch (AM.Scale) {
4212  case 0: // "r", "r+i" or "i" is allowed
4213  break;
4214  case 1:
4215  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4216  return false;
4217  // Otherwise we have r+i.
4218  break;
4219  default:
4220  // No scale > 1 is allowed
4221  return false;
4222  }
4223  return true;
4224 }
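// A minimal usage sketch of the hook above; `exampleAddrModeQueries` is a
// hypothetical helper written for illustration only, not part of the original
// lowering code. It exercises the legal and illegal modes listed in the
// comment above.
static void exampleAddrModeQueries(const NVPTXTargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty) {
  TargetLowering::AddrMode AM;
  AM.HasBaseReg = true;                                       // [areg]
  bool RegOnly = TLI.isLegalAddressingMode(DL, AM, Ty, /*AS=*/0, nullptr);
  AM.BaseOffs = 16;                                           // [areg+immoff]
  bool RegPlusImm = TLI.isLegalAddressingMode(DL, AM, Ty, /*AS=*/0, nullptr);
  AM.Scale = 2;                             // base + 2*index: no scaled modes
  bool Scaled = TLI.isLegalAddressingMode(DL, AM, Ty, /*AS=*/0, nullptr);
  assert(RegOnly && RegPlusImm && !Scaled && "unexpected legality results");
  (void)RegOnly; (void)RegPlusImm; (void)Scaled;
}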
4225 
4226 //===----------------------------------------------------------------------===//
4227 // NVPTX Inline Assembly Support
4228 //===----------------------------------------------------------------------===//
4229 
4230 /// getConstraintType - Given a constraint letter, return the type of
4231 /// constraint it is for this target.
4232 NVPTXTargetLowering::ConstraintType
4233 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4234  if (Constraint.size() == 1) {
4235  switch (Constraint[0]) {
4236  default:
4237  break;
4238  case 'b':
4239  case 'r':
4240  case 'h':
4241  case 'c':
4242  case 'l':
4243  case 'f':
4244  case 'd':
4245  case '0':
4246  case 'N':
4247  return C_RegisterClass;
4248  }
4249  }
4250  return TargetLowering::getConstraintType(Constraint);
4251 }
4252 
4253 std::pair<unsigned, const TargetRegisterClass *>
4254 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4255  StringRef Constraint,
4256  MVT VT) const {
4257  if (Constraint.size() == 1) {
4258  switch (Constraint[0]) {
4259  case 'b':
4260  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4261  case 'c':
4262  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4263  case 'h':
4264  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4265  case 'r':
4266  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4267  case 'l':
4268  case 'N':
4269  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4270  case 'f':
4271  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4272  case 'd':
4273  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4274  }
4275  }
4276  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4277 }
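// How these constraint letters surface to users, as a rough illustration (the
// CUDA-style inline asm below is an assumed example, not taken from this
// file). Each letter selects one of the register classes returned above:
//
//   asm("add.s32 %0, %1, %2;" : "=r"(res)  : "r"(a), "r"(b));   // Int32Regs
//   asm("add.f64 %0, %1, %2;" : "=d"(dres) : "d"(x), "d"(y));   // Float64Regs
//
// Similarly 'h'/'c' pick 16-bit integer registers, 'l' and 'N' 64-bit integer
// registers, 'f' 32-bit float registers, and 'b' predicate (i1) registers.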
4278 
4279 //===----------------------------------------------------------------------===//
4280 // NVPTX DAG Combining
4281 //===----------------------------------------------------------------------===//
4282 
4283 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4284  CodeGenOpt::Level OptLevel) const {
4285  // Always honor command-line argument
4286  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4287  return FMAContractLevelOpt > 0;
4288 
4289  // Do not contract if we're not optimizing the code.
4290  if (OptLevel == 0)
4291  return false;
4292 
4293  // Honor TargetOptions flags that explicitly say fusion is okay.
4294  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4295  return true;
4296 
4297  return allowUnsafeFPMath(MF);
4298 }
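// Summary of the decision order implemented above, with an assumed example of
// how the DAG combiner consults this hook (the call shape mirrors the one
// used later in this file):
//
//   1. an explicit FMAContractLevelOpt setting on the command line wins;
//   2. no contraction when not optimizing;
//   3. otherwise contract if TargetOptions request fast FP-op fusion or
//      unsafe FP math is in effect for the function.
//
//   bool Contract = TLI->allowFMA(DAG.getMachineFunction(), OptLevel);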
4299 
4300 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4301  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4302  if (MF.getTarget().Options.UnsafeFPMath)
4303  return true;
4304 
4305  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4306  const Function &F = MF.getFunction();
4307  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
4308 }
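// Illustration of the attribute checked above (the IR is an assumed example,
// not taken from this file). A function built with fast-math style options
// usually carries:
//
//   define float @foo(float %x) #0 { ... }
//   attributes #0 = { "unsafe-fp-math"="true" }
//
// With that attribute, or with TargetOptions::UnsafeFPMath set, this hook
// returns true and FMA contraction above is permitted.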
4309 
4310 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4311 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4312 /// called with the default operands, and if that fails, with commuted
4313 /// operands.
4314 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4315  TargetLowering::DAGCombinerInfo &DCI,
4316  const NVPTXSubtarget &Subtarget,
4317  CodeGenOpt::Level OptLevel) {
4318  SelectionDAG &DAG = DCI.DAG;
4319  // Skip non-integer, non-scalar case
4320  EVT VT=N0.getValueType();
4321  if (VT.isVector())
4322  return SDValue();
4323 
4324  // fold (add (mul a, b), c) -> (mad a, b, c)
4325  //
4326  if (N0.getOpcode() == ISD::MUL) {
4327  assert (VT.isInteger());
4328  // For integer:
4329  // Since integer multiply-add costs the same as integer multiply
4330  // but is more costly than integer add, do the fusion only when
4331  // the mul is only used in the add.
4332  if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4333  !N0.getNode()->hasOneUse())
4334  return SDValue();
4335 
4336  // Do the folding
4337  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4338  N0.getOperand(0), N0.getOperand(1), N1);
4339  }
4340  else if (N0.getOpcode() == ISD::FMUL) {
4341  if (VT == MVT::f32 || VT == MVT::f64) {
4342  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4343  &DAG.getTargetLoweringInfo());
4344  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4345  return SDValue();
4346 
4347  // For floating point:
4348  // Do the fusion only when the mul has fewer than 5 uses and all
4349  // of them are adds.
4350  // The heuristic is that if a use is not an add, then that use
4351  // cannot be fused into an fma, so the mul is still needed anyway.
4352  // If there are more than 4 uses, even if they are all adds, fusing
4353  // them will increase register pressure.
4354  //
4355  int numUses = 0;
4356  int nonAddCount = 0;
4357  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4358  UE = N0.getNode()->use_end();
4359  UI != UE; ++UI) {
4360  numUses++;
4361  SDNode *User = *UI;
4362  if (User->getOpcode() != ISD::FADD)
4363  ++nonAddCount;
4364  }
4365  if (numUses >= 5)
4366  return SDValue();
4367  if (nonAddCount) {
4368  int orderNo = N->getIROrder();
4369  int orderNo2 = N0.getNode()->getIROrder();
4370  // Simple heuristic for estimating potential register pressure:
4371  // the difference in IR order is used to measure the distance
4372  // between the def and the use; the longer the distance, the more
4373  // likely it is to increase register pressure.
4374  if (orderNo - orderNo2 < 500)
4375  return SDValue();
4376 
4377  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4378  // which guarantees that the FMA will not increase register pressure at node N.
4379  bool opIsLive = false;
4380  const SDNode *left = N0.getOperand(0).getNode();
4381  const SDNode *right = N0.getOperand(1).getNode();
4382 
4383  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4384  opIsLive = true;
4385 
4386  if (!opIsLive)
4387  for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4388  SDNode *User = *UI;
4389  int orderNo3 = User->getIROrder();
4390  if (orderNo3 > orderNo) {
4391  opIsLive = true;
4392  break;
4393  }
4394  }
4395 
4396  if (!opIsLive)
4397  for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4398  SDNode *User = *UI;
4399  int orderNo3 = User->getIROrder();
4400  if (orderNo3 > orderNo) {
4401  opIsLive = true;
4402  break;
4403  }
4404  }
4405 
4406  if (!opIsLive)
4407  return SDValue();
4408  }
4409 
4410  return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4411  N0.getOperand(0), N0.getOperand(1), N1);
4412  }
4413  }
4414 
4415  return SDValue();
4416 }
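// Worked example of the integer fold above (the IR and PTX are assumed,
// typical shapes rather than output captured from this exact build). Given
// i32 IR where the multiply feeds only the add:
//
//   %t = mul i32 %a, %b
//   %r = add i32 %t, %c
//
// the combine rewrites (add (mul a, b), c) into an NVPTXISD::IMAD node, which
// selects to a single instruction:
//
//   mad.lo.s32 %r, %a, %b, %c;
//
// The f32/f64 path instead forms ISD::FMA, subject to the use-count and
// IR-order heuristics above.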
4417 
4418 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4419 ///
4420 static SDValue PerformADDCombine(SDNode *N,
4421  TargetLowering::DAGCombinerInfo &DCI,
4422  const NVPTXSubtarget &Subtarget,
4423  CodeGenOpt::Level OptLevel) {
4424  SDValue N0 = N->getOperand(0);
4425  SDValue N1 = N->getOperand(1);
4426 
4427  // First try with the default operand order.
4428  if (SDValue Result =
4429  PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4430  return Result;
4431 
4432  // If that didn't work, try again with the operands commuted.
4433  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4434 }
4435 
4436 static SDValue PerformANDCombine(SDNode *N,
4437  TargetLowering::DAGCombinerInfo &DCI) {
4438  // The type legalizer turns a vector load of i8 values into a zextload to i16
4439  // registers, optionally ANY_EXTENDs it (if target type is integer),
4440  // and ANDs off the high 8 bits. Since we turn this load into a
4441  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4442  // nodes. Do that here.
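  // Schematically, assuming the zextload shape described above, the pattern
  // removed here is:
  //
  //   (and (any_extend (NVPTXISD::LoadV2/LoadV4 ..., zextload vNi8)), 0xff)
  //
  // Since the target load already zero-extends each i8 lane into its i16
  // register, the 0xff mask is redundant.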
4443  SDValue Val = N->getOperand(0);
4444  SDValue Mask = N->getOperand(1);
4445 
4446  if (isa<ConstantSDNode>(Val)) {
4447  std::swap(Val, Mask);
4448  }
4449 
4450  SDValue AExt;
4451  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4452  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4453  AExt = Val;
4454  Val = Val->getOperand(0);
4455  }
4456 
4457  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4458  Val = Val->getOperand(0);
4459  }
4460 
4461  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4462  Val->getOpcode() == NVPTXISD::LoadV4) {
4463  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4464  if (!MaskCnst) {
4465  // Not an AND with a constant
4466  return SDValue();
4467  }
4468 
4469  uint64_t MaskVal = MaskCnst->getZExtValue();
4470  if (MaskVal != 0xff) {
4471  // Not an AND that chops off top 8 bits
4472  return SDValue();
4473  }
4474 
4475  MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4476  if (!Mem) {
4477  // Not a MemSDNode?!?
4478  return SDValue();
4479  }
4480 
4481  EVT MemVT = Mem->getMemoryVT();
4482  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4483  // We only handle the i8 case
4484  return SDValue();
4485  }
4486 
4487  unsigned ExtType =
4488  cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4489  getZExtValue();
4490  if (ExtType == ISD::SEXTLOAD) {
4491  // If for some reason the load is a sextload, the and is needed to zero
4492  // out the high 8 bits
4493  return SDValue();
4494  }
4495 
4496  bool AddTo = false;
4497  if (AExt.getNode() != nullptr) {
4498  //