1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
35#include "llvm/IR/Argument.h"
36#include "llvm/IR/Attributes.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
40#include "llvm/IR/FPEnv.h"
41#include "llvm/IR/Function.h"
42#include "llvm/IR/GlobalValue.h"
43#include "llvm/IR/Instruction.h"
45#include "llvm/IR/IntrinsicsNVPTX.h"
46#include "llvm/IR/Module.h"
47#include "llvm/IR/Type.h"
48#include "llvm/IR/Value.h"
56#include <algorithm>
57#include <cassert>
58#include <cmath>
59#include <cstdint>
60#include <iterator>
61#include <sstream>
62#include <string>
63#include <utility>
64#include <vector>
65
66#define DEBUG_TYPE "nvptx-lower"
67
68using namespace llvm;
69
70static std::atomic<unsigned> GlobalUniqueCallSite;
71
73 "nvptx-sched4reg",
74 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
75
77 "nvptx-fma-level", cl::Hidden,
78 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
79 " 1: do it 2: do it aggressively"),
80 cl::init(2));
81
83 "nvptx-prec-divf32", cl::Hidden,
84 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
85 " IEEE Compliant F32 div.rnd if available."),
86 cl::init(2));
87
89 "nvptx-prec-sqrtf32", cl::Hidden,
90 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
91 cl::init(true));
92
94 "nvptx-force-min-byval-param-align", cl::Hidden,
95 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
96 " params of device functions."),
97 cl::init(false));
98
99int NVPTXTargetLowering::getDivF32Level() const {
100 if (UsePrecDivF32.getNumOccurrences() > 0) {
101 // If nvptx-prec-divf32=N is used on the command line, always honor it.
102 return UsePrecDivF32;
103 } else {
104 // Otherwise, use div.approx if fast math is enabled
105 if (getTargetMachine().Options.UnsafeFPMath)
106 return 0;
107 else
108 return 2;
109 }
110}
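// For example, compiling with '-nvptx-prec-divf32=0' always selects
// div.approx.f32 and '=1' selects div.full.f32, while with no flag the
// lowering uses the IEEE-compliant div.rn.f32 form unless unsafe FP math is
// enabled.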
111
112bool NVPTXTargetLowering::usePrecSqrtF32() const {
113 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
114 // If nvptx-prec-sqrtf32 is used on the command line, always honor it.
115 return UsePrecSqrtF32;
116 } else {
117 // Otherwise, use sqrt.approx if fast math is enabled.
118 return !getTargetMachine().Options.UnsafeFPMath;
119 }
120}
121
125}
126
127static bool IsPTXVectorType(MVT VT) {
128 switch (VT.SimpleTy) {
129 default:
130 return false;
131 case MVT::v2i1:
132 case MVT::v4i1:
133 case MVT::v2i8:
134 case MVT::v4i8:
135 case MVT::v2i16:
136 case MVT::v4i16:
137 case MVT::v8i16: // <4 x i16x2>
138 case MVT::v2i32:
139 case MVT::v4i32:
140 case MVT::v2i64:
141 case MVT::v2f16:
142 case MVT::v4f16:
143 case MVT::v8f16: // <4 x f16x2>
144 case MVT::v2bf16:
145 case MVT::v4bf16:
146 case MVT::v8bf16: // <4 x bf16x2>
147 case MVT::v2f32:
148 case MVT::v4f32:
149 case MVT::v2f64:
150 return true;
151 }
152}
153
154static bool Is16bitsType(MVT VT) {
155 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
156 VT.SimpleTy == MVT::i16);
157}
158
159/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
160/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
161/// into their primitive components.
162/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
163/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
164/// LowerCall, and LowerReturn.
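/// For example, an i128 is returned as two MVT::i64 pieces at offsets 0 and 8,
/// and a <4 x half> is returned as two MVT::v2f16 pieces rather than four f16
/// scalars, matching how such values appear in the Ins/Outs arrays.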
165static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
166 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
167 SmallVectorImpl<uint64_t> *Offsets = nullptr,
168 uint64_t StartingOffset = 0) {
169 SmallVector<EVT, 16> TempVTs;
170 SmallVector<uint64_t, 16> TempOffsets;
171
172 // Special case for i128 - decompose to (i64, i64)
173 if (Ty->isIntegerTy(128)) {
174 ValueVTs.push_back(EVT(MVT::i64));
175 ValueVTs.push_back(EVT(MVT::i64));
176
177 if (Offsets) {
178 Offsets->push_back(StartingOffset + 0);
179 Offsets->push_back(StartingOffset + 8);
180 }
181
182 return;
183 }
184
185 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
186 if (StructType *STy = dyn_cast<StructType>(Ty)) {
187 auto const *SL = DL.getStructLayout(STy);
188 auto ElementNum = 0;
189 for (auto *EI : STy->elements()) {
190 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
191 StartingOffset + SL->getElementOffset(ElementNum));
192 ++ElementNum;
193 }
194 return;
195 }
196
197 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
198 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
199 EVT VT = TempVTs[i];
200 uint64_t Off = TempOffsets[i];
201 // Split vectors into individual elements, except for v2f16, which
202 // we will pass as a single scalar.
203 if (VT.isVector()) {
204 unsigned NumElts = VT.getVectorNumElements();
205 EVT EltVT = VT.getVectorElementType();
206 // Vectors with an even number of f16 elements will be passed to
207 // us as an array of v2f16/v2bf16 elements. We must match this so we
208 // stay in sync with Ins/Outs.
209 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
210 switch (EltVT.getSimpleVT().SimpleTy) {
211 case MVT::f16:
212 EltVT = MVT::v2f16;
213 break;
214 case MVT::bf16:
215 EltVT = MVT::v2bf16;
216 break;
217 case MVT::i16:
218 EltVT = MVT::v2i16;
219 break;
220 default:
221 llvm_unreachable("Unexpected type");
222 }
223 NumElts /= 2;
224 }
225 for (unsigned j = 0; j != NumElts; ++j) {
226 ValueVTs.push_back(EltVT);
227 if (Offsets)
228 Offsets->push_back(Off + j * EltVT.getStoreSize());
229 }
230 } else {
231 ValueVTs.push_back(VT);
232 if (Offsets)
233 Offsets->push_back(Off);
234 }
235 }
236}
237
238/// PromoteScalarIntegerPTX
239/// Used to make sure the arguments/returns are suitable for passing
240/// and promote them to a larger size if they're not.
241///
242/// The promoted type is placed in \p PromoteVT if the function returns true.
243static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
244 if (VT.isScalarInteger()) {
245 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
246 default:
247 llvm_unreachable(
248 "Promotion is not suitable for scalars of size larger than 64-bits");
249 case 1:
250 *PromotedVT = MVT::i1;
251 break;
252 case 2:
253 case 4:
254 case 8:
255 *PromotedVT = MVT::i8;
256 break;
257 case 16:
258 *PromotedVT = MVT::i16;
259 break;
260 case 32:
261 *PromotedVT = MVT::i32;
262 break;
263 case 64:
264 *PromotedVT = MVT::i64;
265 break;
266 }
267 return EVT(*PromotedVT) != VT;
268 }
269 return false;
270}
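// For example, an i1 is left as MVT::i1, an i6 is promoted to MVT::i8, an i17
// to MVT::i32, and an i33 to MVT::i64; exact power-of-two widths up to 64 bits
// are already suitable, so the function returns false for them.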
271
272// Check whether we can merge loads/stores of some of the pieces of a
273// flattened function parameter or return value into a single vector
274// load/store.
275//
276// The flattened parameter is represented as a list of EVTs and
277// offsets, and the whole structure is aligned to ParamAlignment. This
278// function determines whether we can load/store pieces of the
279// parameter starting at index Idx using a single vectorized op of
280// size AccessSize. If so, it returns the number of param pieces
281// covered by the vector op. Otherwise, it returns 1.
282static unsigned CanMergeParamLoadStoresStartingAt(
283 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
284 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
285
286 // Can't vectorize if param alignment is not sufficient.
287 if (ParamAlignment < AccessSize)
288 return 1;
289 // Can't vectorize if offset is not aligned.
290 if (Offsets[Idx] & (AccessSize - 1))
291 return 1;
292
293 EVT EltVT = ValueVTs[Idx];
294 unsigned EltSize = EltVT.getStoreSize();
295
296 // Element is too large to vectorize.
297 if (EltSize >= AccessSize)
298 return 1;
299
300 unsigned NumElts = AccessSize / EltSize;
301 // Can't vectorize if AccessSize is not a multiple of EltSize.
302 if (AccessSize != EltSize * NumElts)
303 return 1;
304
305 // We don't have enough elements to vectorize.
306 if (Idx + NumElts > ValueVTs.size())
307 return 1;
308
309 // PTX ISA can only deal with 2- and 4-element vector ops.
310 if (NumElts != 4 && NumElts != 2)
311 return 1;
312
313 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
314 // Types do not match.
315 if (ValueVTs[j] != EltVT)
316 return 1;
317
318 // Elements are not contiguous.
319 if (Offsets[j] - Offsets[j - 1] != EltSize)
320 return 1;
321 }
322 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
323 return NumElts;
324}
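// For example, four contiguous f32 pieces at offsets {0, 4, 8, 12} within a
// 16-byte-aligned parameter can be covered by a single 16-byte access starting
// at Idx 0 (NumElts == 4); if the parameter were only 8-byte aligned, at most
// pairs of pieces could be merged.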
325
326// Flags for tracking per-element vectorization state of loads/stores
327// of a flattened function parameter or return value.
328enum ParamVectorizationFlags {
329 PVF_INNER = 0x0, // Middle elements of a vector.
330 PVF_FIRST = 0x1, // First element of the vector.
331 PVF_LAST = 0x2,  // Last element of the vector.
332 // Scalar is effectively a 1-element vector.
333 PVF_SCALAR = PVF_FIRST | PVF_LAST
334};
335
336// Computes whether and how we can vectorize the loads/stores of a
337// flattened function parameter or return value.
338//
339// The flattened parameter is represented as the list of ValueVTs and
340// Offsets, and is aligned to ParamAlignment bytes. We return a vector
341// of the same size as ValueVTs indicating how each piece should be
342// loaded/stored (i.e. as a scalar, or as part of a vector
343// load/store).
344static SmallVector<ParamVectorizationFlags, 16>
345VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
346 const SmallVectorImpl<uint64_t> &Offsets,
347 Align ParamAlignment, bool IsVAArg = false) {
348 // Set vector size to match ValueVTs and mark all elements as
349 // scalars by default.
350 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
351 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
352
353 if (IsVAArg)
354 return VectorInfo;
355
356 // Check what we can vectorize using 128/64/32-bit accesses.
357 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
358 // Skip elements we've already processed.
359 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
360 for (unsigned AccessSize : {16, 8, 4, 2}) {
361 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
362 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
363 // Mark vectorized elements.
364 switch (NumElts) {
365 default:
366 llvm_unreachable("Unexpected return value");
367 case 1:
368 // Can't vectorize using this size, try next smaller size.
369 continue;
370 case 2:
371 assert(I + 1 < E && "Not enough elements.");
372 VectorInfo[I] = PVF_FIRST;
373 VectorInfo[I + 1] = PVF_LAST;
374 I += 1;
375 break;
376 case 4:
377 assert(I + 3 < E && "Not enough elements.");
378 VectorInfo[I] = PVF_FIRST;
379 VectorInfo[I + 1] = PVF_INNER;
380 VectorInfo[I + 2] = PVF_INNER;
381 VectorInfo[I + 3] = PVF_LAST;
382 I += 3;
383 break;
384 }
385 // Break out of the inner loop because we've already succeeded
386 // using largest possible AccessSize.
387 break;
388 }
389 }
390 return VectorInfo;
391}
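// For example, eight i32 pieces of a 16-byte-aligned parameter come back as
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} twice, i.e. two 4-element
// accesses; with only 4-byte alignment every piece stays PVF_SCALAR.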
392
393// NVPTXTargetLowering Constructor.
394NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
395 const NVPTXSubtarget &STI)
396 : TargetLowering(TM), nvTM(&TM), STI(STI) {
397 // Always lower memset, memcpy, and memmove intrinsics to load/store
398 // instructions, rather than generating calls to memset, memcpy, or
399 // memmove.
403
406
407 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
408 // condition branches.
409 setJumpIsExpensive(true);
410
411 // Wide divides are _very_ slow. Try to reduce the width of the divide if
412 // possible.
413 addBypassSlowDiv(64, 32);
414
415 // By default, use the Source scheduling
416 if (sched4reg)
418 else
420
421 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
422 LegalizeAction NoF16Action) {
423 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
424 };
425
426 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
427 LegalizeAction NoBF16Action) {
428 bool IsOpSupported = STI.hasBF16Math();
429 // A few instructions are available only on sm_90.
430 switch(Op) {
431 case ISD::FADD:
432 case ISD::FMUL:
433 case ISD::FSUB:
434 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
435 break;
436 }
438 Op, VT, IsOpSupported ? Action : NoBF16Action);
439 };
440
441 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
442 LegalizeAction NoI16x2Action) {
443 bool IsOpSupported = false;
444 // These instructions are available only on sm_90.
445 switch (Op) {
446 case ISD::ADD:
447 case ISD::SMAX:
448 case ISD::SMIN:
449 case ISD::UMIN:
450 case ISD::UMAX:
451 case ISD::SUB:
452 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
453 break;
454 }
455 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
456 };
457
458 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
459 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
460 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
461 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
462 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
463 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
464 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
465 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
466 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
467 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
468 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
469
470 // Conversion to/from FP16/FP16x2 is always legal.
475
476 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
477 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
478
479 // Conversion to/from BF16/BF16x2 is always legal.
484
485 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
486 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
487
488 // Conversion to/from i16/i16x2 is always legal.
493
494 // Operations not directly supported by NVPTX.
495 for (MVT VT :
496 {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::f64,
497 MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}) {
500 }
501
502 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
503 // For others we will expand to a SHL/SRA pair.
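// For example, sign-extending the low 8 bits held in a 32-bit register can be
// emitted with a single cvt instruction (e.g. cvt.s32.s8), while widths with
// no matching cvt become a shift-left/arithmetic-shift-right pair.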
510
517
520
521 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
522 // that don't have h/w rotation we lower them to multi-instruction assembly.
523 // See ROT*_sw in NVPTXIntrInfo.td
528
530 setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
532 setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
539
540 // Indirect branch is not supported.
541 // This also disables Jump Table creation.
544
547
548 // We want to legalize constant-related memmove and memcpy
549 // intrinsics.
551
552 // Turn FP extload into load/fpextend
553 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
554 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
555 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
556 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
557 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
558 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
559 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
560 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
561 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
562 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
563 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
564 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
565 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
566 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
567 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
568 // Turn FP truncstore into trunc + store.
569 // FIXME: vector types should also be expanded
570 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
571 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
572 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
573 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
574 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
575
576 // PTX does not support load / store predicate registers
579
580 for (MVT VT : MVT::integer_valuetypes()) {
583 setTruncStoreAction(VT, MVT::i1, Expand);
584 }
585
586 // Expand extloads of integer vectors.
588 MVT::v2i8, Expand);
589 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
590
591 // This is legal in NVPTX
596
597 // TRAP can be lowered to PTX trap
598 setOperationAction(ISD::TRAP, MVT::Other, Legal);
599
600 // Register custom handling for vector loads/stores
602 if (IsPTXVectorType(VT)) {
606 }
607 }
608
609 // Support varargs.
614
615 // Custom handling for i8 intrinsics
617
618 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
624
627 }
628
629 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
630 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
631 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
632 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
633 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
634 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
635 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
636
637 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
638 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
639 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
640 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
641 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
642 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
643
644 // Other arithmetic and logic ops are unsupported.
649 MVT::v2i16, Expand);
650
655 if (STI.getPTXVersion() >= 43) {
660 }
661
663 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
666
667 // PTX does not directly support SELP of i1, so promote to i32 first
669
670 // PTX cannot multiply two i64s in a single instruction.
673
674 // We have some custom DAG combine patterns for these nodes
677
678 // setcc for f16x2 and bf16x2 needs special handling to prevent
679 // legalizer's attempt to scalarize it due to v2i1 not being legal.
680 if (STI.allowFP16Math() || STI.hasBF16Math())
682
683 // Promote fp16 arithmetic if fp16 hardware isn't available or the
684 // user passed --nvptx-no-fp16-math. The flag is useful because,
685 // although sm_53+ GPUs have some sort of FP16 support in
686 // hardware, only sm_53 and sm_60 have a full implementation. Others
687 // only have a token amount of hardware and are likely to run faster
688 // by using fp32 units instead.
689 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
690 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
691 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
692 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
693 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
694 // bf16 must be promoted to f32.
695 if (getOperationAction(Op, MVT::bf16) == Promote)
696 AddPromotedToType(Op, MVT::bf16, MVT::f32);
697 }
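// With Promote in effect, an IR 'fadd half' is roughly emitted as cvt.f32.f16
// on each operand, add.f32, and cvt.rn.f16.f32 on the result; when f16 math is
// usable it stays a single add.f16 (or add.f16x2 for v2f16).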
698
699 // f16/f16x2 neg was introduced in PTX ISA 6.0 for sm_53+.
700 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
701 STI.getPTXVersion() >= 60 &&
702 STI.allowFP16Math();
703 for (const auto &VT : {MVT::f16, MVT::v2f16})
705 IsFP16FP16x2NegAvailable ? Legal : Expand);
706
707 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
708 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
709 // (would be) Library functions.
710
711 // These map to conversion instructions for scalar FP types.
712 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
714 setOperationAction(Op, MVT::bf16, Legal);
715 setOperationAction(Op, MVT::f16, Legal);
716 setOperationAction(Op, MVT::f32, Legal);
717 setOperationAction(Op, MVT::f64, Legal);
718 setOperationAction(Op, MVT::v2f16, Expand);
719 setOperationAction(Op, MVT::v2bf16, Expand);
720 }
721
728
729
730 // 'Expand' implements FCOPYSIGN without calling an external library.
737
738 // These map to corresponding instructions for f32/f64. f16 must be
739 // promoted to f32. v2f16 is expanded to f16, which is then promoted
740 // to f32.
741 for (const auto &Op :
743 setOperationAction(Op, MVT::f16, Promote);
744 setOperationAction(Op, MVT::bf16, Promote);
745 setOperationAction(Op, MVT::f32, Legal);
746 setOperationAction(Op, MVT::f64, Legal);
747 setOperationAction(Op, MVT::v2f16, Expand);
748 setOperationAction(Op, MVT::v2bf16, Expand);
749 }
750 // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
751 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
752 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
753 return IsAtLeastSm80 ? Legal : NotSm80Action;
754 };
755 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
756 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
757 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
758 setOperationAction(Op, MVT::f32, Legal);
759 setOperationAction(Op, MVT::f64, Legal);
760 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
761 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
762 }
763 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
764 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
765 setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
766 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
767 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
768 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
769 }
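// For example, on sm_80+ an f16 fminnum lowers to min.f16 (min.f16x2 for
// v2f16), while on sm_70 the f16 operands are promoted and min.f32 is used
// instead.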
770
771 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
772 // No FPOW or FREM in PTX.
773
774 // Now deduce the information based on the above-mentioned
775 // actions.
777
779}
780
781const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
782 switch ((NVPTXISD::NodeType)Opcode) {
784 break;
785 case NVPTXISD::CALL:
786 return "NVPTXISD::CALL";
788 return "NVPTXISD::RET_GLUE";
790 return "NVPTXISD::LOAD_PARAM";
792 return "NVPTXISD::Wrapper";
794 return "NVPTXISD::DeclareParam";
796 return "NVPTXISD::DeclareScalarParam";
798 return "NVPTXISD::DeclareRet";
800 return "NVPTXISD::DeclareScalarRet";
802 return "NVPTXISD::DeclareRetParam";
804 return "NVPTXISD::PrintCall";
806 return "NVPTXISD::PrintConvergentCall";
808 return "NVPTXISD::PrintCallUni";
810 return "NVPTXISD::PrintConvergentCallUni";
812 return "NVPTXISD::LoadParam";
814 return "NVPTXISD::LoadParamV2";
816 return "NVPTXISD::LoadParamV4";
818 return "NVPTXISD::StoreParam";
820 return "NVPTXISD::StoreParamV2";
822 return "NVPTXISD::StoreParamV4";
824 return "NVPTXISD::StoreParamS32";
826 return "NVPTXISD::StoreParamU32";
828 return "NVPTXISD::CallArgBegin";
830 return "NVPTXISD::CallArg";
832 return "NVPTXISD::LastCallArg";
834 return "NVPTXISD::CallArgEnd";
836 return "NVPTXISD::CallVoid";
838 return "NVPTXISD::CallVal";
840 return "NVPTXISD::CallSymbol";
842 return "NVPTXISD::Prototype";
844 return "NVPTXISD::MoveParam";
846 return "NVPTXISD::StoreRetval";
848 return "NVPTXISD::StoreRetvalV2";
850 return "NVPTXISD::StoreRetvalV4";
852 return "NVPTXISD::PseudoUseParam";
853 case NVPTXISD::RETURN:
854 return "NVPTXISD::RETURN";
856 return "NVPTXISD::CallSeqBegin";
858 return "NVPTXISD::CallSeqEnd";
860 return "NVPTXISD::CallPrototype";
862 return "NVPTXISD::ProxyReg";
863 case NVPTXISD::LoadV2:
864 return "NVPTXISD::LoadV2";
865 case NVPTXISD::LoadV4:
866 return "NVPTXISD::LoadV4";
867 case NVPTXISD::LDGV2:
868 return "NVPTXISD::LDGV2";
869 case NVPTXISD::LDGV4:
870 return "NVPTXISD::LDGV4";
871 case NVPTXISD::LDUV2:
872 return "NVPTXISD::LDUV2";
873 case NVPTXISD::LDUV4:
874 return "NVPTXISD::LDUV4";
876 return "NVPTXISD::StoreV2";
878 return "NVPTXISD::StoreV4";
880 return "NVPTXISD::FUN_SHFL_CLAMP";
882 return "NVPTXISD::FUN_SHFR_CLAMP";
883 case NVPTXISD::IMAD:
884 return "NVPTXISD::IMAD";
886 return "NVPTXISD::SETP_F16X2";
887 case NVPTXISD::Dummy:
888 return "NVPTXISD::Dummy";
890 return "NVPTXISD::MUL_WIDE_SIGNED";
892 return "NVPTXISD::MUL_WIDE_UNSIGNED";
893 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
894 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
896 return "NVPTXISD::Tex1DFloatFloatLevel";
898 return "NVPTXISD::Tex1DFloatFloatGrad";
899 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
900 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
902 return "NVPTXISD::Tex1DS32FloatLevel";
904 return "NVPTXISD::Tex1DS32FloatGrad";
905 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
906 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
908 return "NVPTXISD::Tex1DU32FloatLevel";
910 return "NVPTXISD::Tex1DU32FloatGrad";
911 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
912 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
914 return "NVPTXISD::Tex1DArrayFloatFloatLevel";
916 return "NVPTXISD::Tex1DArrayFloatFloatGrad";
917 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
918 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
920 return "NVPTXISD::Tex1DArrayS32FloatLevel";
922 return "NVPTXISD::Tex1DArrayS32FloatGrad";
923 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
924 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
926 return "NVPTXISD::Tex1DArrayU32FloatLevel";
928 return "NVPTXISD::Tex1DArrayU32FloatGrad";
929 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
930 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
932 return "NVPTXISD::Tex2DFloatFloatLevel";
934 return "NVPTXISD::Tex2DFloatFloatGrad";
935 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
936 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
938 return "NVPTXISD::Tex2DS32FloatLevel";
940 return "NVPTXISD::Tex2DS32FloatGrad";
941 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
942 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
944 return "NVPTXISD::Tex2DU32FloatLevel";
946 return "NVPTXISD::Tex2DU32FloatGrad";
947 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
948 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
950 return "NVPTXISD::Tex2DArrayFloatFloatLevel";
952 return "NVPTXISD::Tex2DArrayFloatFloatGrad";
953 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
954 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
956 return "NVPTXISD::Tex2DArrayS32FloatLevel";
958 return "NVPTXISD::Tex2DArrayS32FloatGrad";
959 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
960 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
962 return "NVPTXISD::Tex2DArrayU32FloatLevel";
964 return "NVPTXISD::Tex2DArrayU32FloatGrad";
965 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
966 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
968 return "NVPTXISD::Tex3DFloatFloatLevel";
970 return "NVPTXISD::Tex3DFloatFloatGrad";
971 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
972 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
974 return "NVPTXISD::Tex3DS32FloatLevel";
976 return "NVPTXISD::Tex3DS32FloatGrad";
977 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
978 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
980 return "NVPTXISD::Tex3DU32FloatLevel";
982 return "NVPTXISD::Tex3DU32FloatGrad";
983 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
985 return "NVPTXISD::TexCubeFloatFloatLevel";
986 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
988 return "NVPTXISD::TexCubeS32FloatLevel";
989 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
991 return "NVPTXISD::TexCubeU32FloatLevel";
993 return "NVPTXISD::TexCubeArrayFloatFloat";
995 return "NVPTXISD::TexCubeArrayFloatFloatLevel";
997 return "NVPTXISD::TexCubeArrayS32Float";
999 return "NVPTXISD::TexCubeArrayS32FloatLevel";
1001 return "NVPTXISD::TexCubeArrayU32Float";
1003 return "NVPTXISD::TexCubeArrayU32FloatLevel";
1005 return "NVPTXISD::Tld4R2DFloatFloat";
1007 return "NVPTXISD::Tld4G2DFloatFloat";
1009 return "NVPTXISD::Tld4B2DFloatFloat";
1011 return "NVPTXISD::Tld4A2DFloatFloat";
1013 return "NVPTXISD::Tld4R2DS64Float";
1015 return "NVPTXISD::Tld4G2DS64Float";
1017 return "NVPTXISD::Tld4B2DS64Float";
1019 return "NVPTXISD::Tld4A2DS64Float";
1021 return "NVPTXISD::Tld4R2DU64Float";
1023 return "NVPTXISD::Tld4G2DU64Float";
1025 return "NVPTXISD::Tld4B2DU64Float";
1027 return "NVPTXISD::Tld4A2DU64Float";
1028
1030 return "NVPTXISD::TexUnified1DFloatS32";
1032 return "NVPTXISD::TexUnified1DFloatFloat";
1034 return "NVPTXISD::TexUnified1DFloatFloatLevel";
1036 return "NVPTXISD::TexUnified1DFloatFloatGrad";
1038 return "NVPTXISD::TexUnified1DS32S32";
1040 return "NVPTXISD::TexUnified1DS32Float";
1042 return "NVPTXISD::TexUnified1DS32FloatLevel";
1044 return "NVPTXISD::TexUnified1DS32FloatGrad";
1046 return "NVPTXISD::TexUnified1DU32S32";
1048 return "NVPTXISD::TexUnified1DU32Float";
1050 return "NVPTXISD::TexUnified1DU32FloatLevel";
1052 return "NVPTXISD::TexUnified1DU32FloatGrad";
1054 return "NVPTXISD::TexUnified1DArrayFloatS32";
1056 return "NVPTXISD::TexUnified1DArrayFloatFloat";
1058 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
1060 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
1062 return "NVPTXISD::TexUnified1DArrayS32S32";
1064 return "NVPTXISD::TexUnified1DArrayS32Float";
1066 return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
1068 return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
1070 return "NVPTXISD::TexUnified1DArrayU32S32";
1072 return "NVPTXISD::TexUnified1DArrayU32Float";
1074 return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
1076 return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
1078 return "NVPTXISD::TexUnified2DFloatS32";
1080 return "NVPTXISD::TexUnified2DFloatFloat";
1082 return "NVPTXISD::TexUnified2DFloatFloatLevel";
1084 return "NVPTXISD::TexUnified2DFloatFloatGrad";
1086 return "NVPTXISD::TexUnified2DS32S32";
1088 return "NVPTXISD::TexUnified2DS32Float";
1090 return "NVPTXISD::TexUnified2DS32FloatLevel";
1092 return "NVPTXISD::TexUnified2DS32FloatGrad";
1094 return "NVPTXISD::TexUnified2DU32S32";
1096 return "NVPTXISD::TexUnified2DU32Float";
1098 return "NVPTXISD::TexUnified2DU32FloatLevel";
1100 return "NVPTXISD::TexUnified2DU32FloatGrad";
1102 return "NVPTXISD::TexUnified2DArrayFloatS32";
1104 return "NVPTXISD::TexUnified2DArrayFloatFloat";
1106 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
1108 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
1110 return "NVPTXISD::TexUnified2DArrayS32S32";
1112 return "NVPTXISD::TexUnified2DArrayS32Float";
1114 return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
1116 return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
1118 return "NVPTXISD::TexUnified2DArrayU32S32";
1120 return "NVPTXISD::TexUnified2DArrayU32Float";
1122 return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
1124 return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
1126 return "NVPTXISD::TexUnified3DFloatS32";
1128 return "NVPTXISD::TexUnified3DFloatFloat";
1130 return "NVPTXISD::TexUnified3DFloatFloatLevel";
1132 return "NVPTXISD::TexUnified3DFloatFloatGrad";
1134 return "NVPTXISD::TexUnified3DS32S32";
1136 return "NVPTXISD::TexUnified3DS32Float";
1138 return "NVPTXISD::TexUnified3DS32FloatLevel";
1140 return "NVPTXISD::TexUnified3DS32FloatGrad";
1142 return "NVPTXISD::TexUnified3DU32S32";
1144 return "NVPTXISD::TexUnified3DU32Float";
1146 return "NVPTXISD::TexUnified3DU32FloatLevel";
1148 return "NVPTXISD::TexUnified3DU32FloatGrad";
1150 return "NVPTXISD::TexUnifiedCubeFloatFloat";
1152 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
1154 return "NVPTXISD::TexUnifiedCubeS32Float";
1156 return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
1158 return "NVPTXISD::TexUnifiedCubeU32Float";
1160 return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
1162 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
1164 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
1166 return "NVPTXISD::TexUnifiedCubeArrayS32Float";
1168 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
1170 return "NVPTXISD::TexUnifiedCubeArrayU32Float";
1172 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
1174 return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
1176 return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
1178 return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
1180 return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
1182 return "NVPTXISD::Tld4UnifiedR2DS64Float";
1184 return "NVPTXISD::Tld4UnifiedG2DS64Float";
1186 return "NVPTXISD::Tld4UnifiedB2DS64Float";
1188 return "NVPTXISD::Tld4UnifiedA2DS64Float";
1190 return "NVPTXISD::Tld4UnifiedR2DU64Float";
1192 return "NVPTXISD::Tld4UnifiedG2DU64Float";
1194 return "NVPTXISD::Tld4UnifiedB2DU64Float";
1196 return "NVPTXISD::Tld4UnifiedA2DU64Float";
1197
1198 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1199 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1200 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1201 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1202 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1203 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1204 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1205 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1206 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1207 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1208 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1209
1210 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1211 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1212 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1213 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1214 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1215 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1216 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1217 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1218 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1219 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1220 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1221
1222 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1223 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1224 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1225 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1226 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1227 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1228 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1229 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1230 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1231 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1232 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1233
1234 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1235 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1236 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1237 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1238 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1239 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1240 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1241 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1242 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1243 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1244 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1245
1246 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1247 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1248 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1249 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1250 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1251 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1252 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1253 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1254 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1255 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1256 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1257
1258 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1259 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1260 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1261 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1262 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1263 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1264 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1265 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1266 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1267 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1268 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1269
1270 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1271 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1272 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1273 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1274 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1275 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1276 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1277 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1278 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1279 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1280 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1281
1282 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1283 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1284 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1285 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1286 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1287 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1288 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1289 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1290 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1291 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1292 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1293
1294 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1295 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1296 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1297 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1298 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1299 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1300 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1301 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1302 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1303 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1304 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1305
1306 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1307 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1308 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1309 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1310 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1311 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1312 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1313 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1314 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1315 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1316 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1317
1318 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1319 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1320 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1321 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1322 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1323 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1324 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1325 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1326 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1327 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1328 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1329
1330 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1331 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1332 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1333 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1334 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1335 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1336 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1337 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1338 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1339 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1340 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1341
1342 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1343 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1344 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1345 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1346 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1347 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1348 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1349 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1350 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1351 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1352 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1353
1354 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1355 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1356 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1357 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1358 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1359 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1360 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1361 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1362 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1363 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1364 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1365
1366 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1367 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1368 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1369 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1370 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1371 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1372 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1373 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1374 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1375 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1376 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1377 }
1378 return nullptr;
1379}
1380
1381TargetLoweringBase::LegalizeTypeAction
1382NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1383 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1384 VT.getScalarType() == MVT::i1)
1385 return TypeSplitVector;
1386 if (Isv2x16VT(VT))
1387 return TypeLegal;
1388 return TargetLoweringBase::getPreferredVectorAction(VT);
1389}
1390
1391SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1392 int Enabled, int &ExtraSteps,
1393 bool &UseOneConst,
1394 bool Reciprocal) const {
1397 return SDValue();
1398
1399 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1400 ExtraSteps = 0;
1401
1402 SDLoc DL(Operand);
1403 EVT VT = Operand.getValueType();
1404 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1405
1406 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1407 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1408 DAG.getConstant(IID, DL, MVT::i32), Operand);
1409 };
1410
1411 // The sqrt and rsqrt refinement processes assume we always start out with an
1412 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1413 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1414 // any refinement, we must return a regular sqrt.
1415 if (Reciprocal || ExtraSteps > 0) {
1416 if (VT == MVT::f32)
1417 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1418 : Intrinsic::nvvm_rsqrt_approx_f);
1419 else if (VT == MVT::f64)
1420 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1421 else
1422 return SDValue();
1423 } else {
1424 if (VT == MVT::f32)
1425 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1426 : Intrinsic::nvvm_sqrt_approx_f);
1427 else {
1428 // There's no sqrt.approx.f64 instruction, so we emit
1429 // reciprocal(rsqrt(x)). This is faster than
1430 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1431 // x * rsqrt(x).)
1432 return DAG.getNode(
1434 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1435 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1436 }
1437 }
1438}
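// For example, with fast math and no refinement steps an f32 sqrt becomes
// sqrt.approx.f32 (or its .ftz variant), while an f64 sqrt is emitted as
// rsqrt.approx.f64 followed by rcp.approx.ftz.f64.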
1439
1440SDValue
1441NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1442 SDLoc dl(Op);
1443 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1444 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1445 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1446 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1447}
1448
1449static bool IsTypePassedAsArray(const Type *Ty) {
1450 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1451 Ty->isHalfTy() || Ty->isBFloatTy();
1452}
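// For example, structs, arrays, vectors, i128, half, and bfloat are all passed
// as .param byte arrays, while i32, float, double, and pointers stay scalar
// .param declarations.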
1453
1454std::string NVPTXTargetLowering::getPrototype(
1455 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1456 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1457 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1458 const CallBase &CB, unsigned UniqueCallSite) const {
1459 auto PtrVT = getPointerTy(DL);
1460
1461 bool isABI = (STI.getSmVersion() >= 20);
1462 assert(isABI && "Non-ABI compilation is not supported");
1463 if (!isABI)
1464 return "";
1465
1466 std::string Prototype;
1467 raw_string_ostream O(Prototype);
1468 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1469
1470 if (retTy->getTypeID() == Type::VoidTyID) {
1471 O << "()";
1472 } else {
1473 O << "(";
1474 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1475 !IsTypePassedAsArray(retTy)) {
1476 unsigned size = 0;
1477 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1478 size = ITy->getBitWidth();
1479 } else {
1480 assert(retTy->isFloatingPointTy() &&
1481 "Floating point type expected here");
1482 size = retTy->getPrimitiveSizeInBits();
1483 }
1484 // PTX ABI requires all scalar return values to be at least 32
1485 // bits in size. fp16 normally uses .b16 as its storage type in
1486 // PTX, so its size must be adjusted here, too.
1487 size = promoteScalarArgumentSize(size);
1488
1489 O << ".param .b" << size << " _";
1490 } else if (isa<PointerType>(retTy)) {
1491 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1492 } else if (IsTypePassedAsArray(retTy)) {
1493 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1494 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1495 } else {
1496 llvm_unreachable("Unknown return type");
1497 }
1498 O << ") ";
1499 }
1500 O << "_ (";
1501
1502 bool first = true;
1503
1504 const Function *F = CB.getFunction();
1505 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1506 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1507 Type *Ty = Args[i].Ty;
1508 if (!first) {
1509 O << ", ";
1510 }
1511 first = false;
1512
1513 if (!Outs[OIdx].Flags.isByVal()) {
1514 if (IsTypePassedAsArray(Ty)) {
1515 unsigned ParamAlign = 0;
1516 const CallInst *CallI = cast<CallInst>(&CB);
1517 // +1 because index 0 is reserved for return type alignment
1518 if (!getAlign(*CallI, i + 1, ParamAlign))
1519 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1520 O << ".param .align " << ParamAlign << " .b8 ";
1521 O << "_";
1522 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1523 // update the index for Outs
1524 SmallVector<EVT, 16> vtparts;
1525 ComputeValueVTs(*this, DL, Ty, vtparts);
1526 if (unsigned len = vtparts.size())
1527 OIdx += len - 1;
1528 continue;
1529 }
1530 // i8 types in IR will be i16 types in SDAG
1531 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1532 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1533 "type mismatch between callee prototype and arguments");
1534 // scalar type
1535 unsigned sz = 0;
1536 if (isa<IntegerType>(Ty)) {
1537 sz = cast<IntegerType>(Ty)->getBitWidth();
1539 } else if (isa<PointerType>(Ty)) {
1540 sz = PtrVT.getSizeInBits();
1541 } else {
1542 sz = Ty->getPrimitiveSizeInBits();
1543 }
1544 O << ".param .b" << sz << " ";
1545 O << "_";
1546 continue;
1547 }
1548
1549 Type *ETy = Args[i].IndirectType;
1550 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1551 Align ParamByValAlign =
1552 getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1553
1554 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1555 O << "_";
1556 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1557 }
1558
1559 if (VAInfo)
1560 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1561 << " .b8 _[]\n";
1562 O << ")";
1564 O << " .noreturn";
1565 O << ";";
1566
1567 return Prototype;
1568}
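// As an illustrative sketch (assuming a 64-bit target and a hypothetical
// 16-byte byval aggregate %S), a call such as
//   float @f(i32 %x, ptr byval(%S) align 8 %p)
// produces a prototype of roughly the form:
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _,
//                                                   .param .align 8 .b8 _[16]);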
1569
1570Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1571 const CallBase *CB, Type *Ty,
1572 unsigned Idx,
1573 const DataLayout &DL) const {
1574 if (!CB) {
1575 // No call-site information; fall back to the ABI type alignment.
1576 return DL.getABITypeAlign(Ty);
1577 }
1578
1579 unsigned Alignment = 0;
1580 const Function *DirectCallee = CB->getCalledFunction();
1581
1582 if (!DirectCallee) {
1583 // We don't have a direct function symbol, but that may be because of
1584 // constant cast instructions in the call.
1585
1586 // With bitcast'd call targets, the instruction will be the call
1587 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1588 // Check if we have call alignment metadata
1589 if (getAlign(*CI, Idx, Alignment))
1590 return Align(Alignment);
1591 }
1592 DirectCallee = getMaybeBitcastedCallee(CB);
1593 }
1594
1595 // Check for function alignment information if we found that the
1596 // ultimate target is a Function
1597 if (DirectCallee) {
1598 if (getAlign(*DirectCallee, Idx, Alignment))
1599 return Align(Alignment);
1600 // If alignment information is not available, fall back to the
1601 // default function param optimized type alignment
1602 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1603 }
1604
1605 // Call is indirect, fall back to the ABI type alignment
1606 return DL.getABITypeAlign(Ty);
1607}
1608
1609SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1610 SmallVectorImpl<SDValue> &InVals) const {
1611
1612 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1613 report_fatal_error(
1614 "Support for variadic functions (unsized array parameter) introduced "
1615 "in PTX ISA version 6.0 and requires target sm_30.");
1616
1617 SelectionDAG &DAG = CLI.DAG;
1618 SDLoc dl = CLI.DL;
1620 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1622 SDValue Chain = CLI.Chain;
1623 SDValue Callee = CLI.Callee;
1624 bool &isTailCall = CLI.IsTailCall;
1625 ArgListTy &Args = CLI.getArgs();
1626 Type *RetTy = CLI.RetTy;
1627 const CallBase *CB = CLI.CB;
1628 const DataLayout &DL = DAG.getDataLayout();
1629
1630 bool isABI = (STI.getSmVersion() >= 20);
1631 assert(isABI && "Non-ABI compilation is not supported");
1632 if (!isABI)
1633 return Chain;
1634
1635 // Variadic arguments.
1636 //
1637 // Normally, for each argument, we declare a param scalar or a param
1638 // byte array in the .param space, and store the argument value to that
1639 // param scalar or array starting at offset 0.
1640 //
1641 // In the case of the first variadic argument, we declare a vararg byte array
1642 // with size 0. The exact size of this array isn't known at this point, so
1643 // it'll be patched later. All the variadic arguments will be stored to this
1644 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1645 // initially set to 0, so it can be used for non-variadic arguments (which use
1646 // 0 offset) to simplify the code.
1647 //
1648 // After all varargs are processed, 'VAOffset' holds the size of the
1649 // vararg byte array.
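// As a rough sketch, a variadic call passing an i32 and a double after the
// fixed arguments declares '.param .align 8 .b8 _[]' for the vararg array,
// stores the i32 at offset 0 and the double at offset 8, and the array's size
// is later patched to 16.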
1650
1651 SDValue VADeclareParam; // vararg byte array
1652 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1653 unsigned VAOffset = 0; // current offset in the param array
1654
1655 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1656 SDValue TempChain = Chain;
1657 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1658 SDValue InGlue = Chain.getValue(1);
1659
1660 unsigned ParamCount = 0;
1661 // Args.size() and Outs.size() need not match.
1662 // Outs.size() will be larger
1663 // * if there is an aggregate argument with multiple fields (each field
1664 // showing up separately in Outs)
1665 // * if there is a vector argument with more than typical vector-length
1666 // elements (generally if more than 4) where each vector element is
1667 // individually present in Outs.
1668 // So a different index should be used for indexing into Outs/OutVals.
1669 // See similar issue in LowerFormalArguments.
1670 unsigned OIdx = 0;
1671 // Declare the .param or .reg spaces needed to pass values
1672 // to the function.
1673 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1674 EVT VT = Outs[OIdx].VT;
1675 Type *Ty = Args[i].Ty;
1676 bool IsVAArg = (i >= CLI.NumFixedArgs);
1677 bool IsByVal = Outs[OIdx].Flags.isByVal();
1678
1681
1682 assert((!IsByVal || Args[i].IndirectType) &&
1683 "byval arg must have indirect type");
1684 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1685 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1686
1687 Align ArgAlign;
1688 if (IsByVal) {
1689 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1690 // so we don't need to worry whether it's naturally aligned or not.
1691 // See TargetLowering::LowerCallTo().
1692 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1693 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1694 InitialAlign, DL);
1695 if (IsVAArg)
1696 VAOffset = alignTo(VAOffset, ArgAlign);
1697 } else {
1698 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
1699 }
1700
1701 unsigned TypeSize =
1702 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1703 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1704
1705 bool NeedAlign; // Does argument declaration specify alignment?
1706 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1707 if (IsVAArg) {
1708 if (ParamCount == FirstVAArg) {
1709 SDValue DeclareParamOps[] = {
1710 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1711 DAG.getConstant(ParamCount, dl, MVT::i32),
1712 DAG.getConstant(1, dl, MVT::i32), InGlue};
1713 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1714 DeclareParamVTs, DeclareParamOps);
1715 }
1716 NeedAlign = PassAsArray;
1717 } else if (PassAsArray) {
1718 // declare .param .align <align> .b8 .param<n>[<size>];
1719 SDValue DeclareParamOps[] = {
1720 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1721 DAG.getConstant(ParamCount, dl, MVT::i32),
1722 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1723 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1724 DeclareParamOps);
1725 NeedAlign = true;
1726 } else {
1727 // declare .param .b<size> .param<n>;
1728 if (VT.isInteger() || VT.isFloatingPoint()) {
1729 // PTX ABI requires integral types to be at least 32 bits in
1730 // size. FP16 is loaded/stored using i16, so it's handled
1731 // here as well.
1732 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1733 }
1734 SDValue DeclareScalarParamOps[] = {
1735 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1736 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1737 DAG.getConstant(0, dl, MVT::i32), InGlue};
1738 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1739 DeclareScalarParamOps);
1740 NeedAlign = false;
1741 }
1742 InGlue = Chain.getValue(1);
1743
1744 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1745 // than 32-bits are sign extended or zero extended, depending on
1746 // whether they are signed or unsigned types. This case applies
1747 // only to scalar parameters and not to aggregate values.
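// For example, an i8 scalar argument is declared as a .b32 param and the value
// is sign- or zero-extended to 32 bits before being stored.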
1748 bool ExtendIntegerParam =
1749 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1750
1751 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1752 SmallVector<SDValue, 6> StoreOperands;
1753 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1754 EVT EltVT = VTs[j];
1755 int CurOffset = Offsets[j];
1756 MaybeAlign PartAlign;
1757 if (NeedAlign)
1758 PartAlign = commonAlignment(ArgAlign, CurOffset);
1759
1760 // New store.
1761 if (VectorInfo[j] & PVF_FIRST) {
1762 assert(StoreOperands.empty() && "Unfinished preceding store.");
1763 StoreOperands.push_back(Chain);
1764 StoreOperands.push_back(
1765 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1766 StoreOperands.push_back(DAG.getConstant(
1767 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1768 dl, MVT::i32));
1769 }
1770
1771 SDValue StVal = OutVals[OIdx];
1772
1773 MVT PromotedVT;
1774 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1775 EltVT = EVT(PromotedVT);
1776 }
1777 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1778 llvm::ISD::NodeType Ext =
1779 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1780 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1781 }
1782
1783 if (IsByVal) {
1784 auto PtrVT = getPointerTy(DL);
1785 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1786 DAG.getConstant(CurOffset, dl, PtrVT));
1787 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1788 PartAlign);
1789 } else if (ExtendIntegerParam) {
1790 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1791 // zext/sext to i32
1792 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1793 : ISD::ZERO_EXTEND,
1794 dl, MVT::i32, StVal);
1795 }
1796
1797 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1798 // Use 16-bit registers for small stores as it's the
1799 // smallest general purpose register size supported by NVPTX.
1800 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1801 }
1802
1803 // Record the value to store.
1804 StoreOperands.push_back(StVal);
1805
1806 if (VectorInfo[j] & PVF_LAST) {
1807 unsigned NumElts = StoreOperands.size() - 3;
1808 NVPTXISD::NodeType Op;
1809 switch (NumElts) {
1810 case 1:
1811 Op = NVPTXISD::StoreParam;
1812 break;
1813 case 2:
1814 Op = NVPTXISD::StoreParamV2;
1815 break;
1816 case 4:
1817 Op = NVPTXISD::StoreParamV4;
1818 break;
1819 default:
1820 llvm_unreachable("Invalid vector info.");
1821 }
1822
1823 StoreOperands.push_back(InGlue);
1824
1825 // Adjust type of the store op if we've extended the scalar
1826 // return value.
1827 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1828
1829 Chain = DAG.getMemIntrinsicNode(
1830 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1831 TheStoreType, MachinePointerInfo(), PartAlign,
1832 MachineMemOperand::MOStore);
1833 InGlue = Chain.getValue(1);
1834
1835 // Cleanup.
1836 StoreOperands.clear();
1837
1838 // TODO: We may need to support vector types that can be passed
1839 // as scalars in variadic arguments.
1840 if (!IsByVal && IsVAArg) {
1841 assert(NumElts == 1 &&
1842 "Vectorization is expected to be disabled for variadics.");
1843 VAOffset += DL.getTypeAllocSize(
1844 TheStoreType.getTypeForEVT(*DAG.getContext()));
1845 }
1846 }
1847 if (!IsByVal)
1848 ++OIdx;
1849 }
1850 assert(StoreOperands.empty() && "Unfinished parameter store.");
1851 if (!IsByVal && VTs.size() > 0)
1852 --OIdx;
1853 ++ParamCount;
1854 if (IsByVal && IsVAArg)
1855 VAOffset += TypeSize;
1856 }
1857
1858 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1859 MaybeAlign retAlignment = std::nullopt;
1860
1861 // Handle Result
1862 if (Ins.size() > 0) {
1863 SmallVector<EVT, 16> resvtparts;
1864 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1865
1866 // Declare
1867 // .param .align N .b8 retval0[<size-in-bytes>], or
1868 // .param .b<size-in-bits> retval0
1869 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1870 if (!IsTypePassedAsArray(RetTy)) {
1871 resultsz = promoteScalarArgumentSize(resultsz);
1872 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1873 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1874 DAG.getConstant(resultsz, dl, MVT::i32),
1875 DAG.getConstant(0, dl, MVT::i32), InGlue };
1876 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1877 DeclareRetOps);
1878 InGlue = Chain.getValue(1);
1879 } else {
1880 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1881 assert(retAlignment && "retAlignment is guaranteed to be set");
1882 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1883 SDValue DeclareRetOps[] = {
1884 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1885 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1886 DAG.getConstant(0, dl, MVT::i32), InGlue};
1887 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1888 DeclareRetOps);
1889 InGlue = Chain.getValue(1);
1890 }
1891 }
1892
1893 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1894 // Set the size of the vararg param byte array if the callee is a variadic
1895 // function and the variadic part is not empty.
1896 if (HasVAArgs) {
1897 SDValue DeclareParamOps[] = {
1898 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1899 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1900 VADeclareParam.getOperand(4)};
1901 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1902 VADeclareParam->getVTList(), DeclareParamOps);
1903 }
1904
1905 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1906 // between them we must rely on the call site value which is valid for
1907 // indirect calls but is always null for libcalls.
1908 bool isIndirectCall = !Func && CB;
1909
1910 if (isa<ExternalSymbolSDNode>(Callee)) {
1911 Function* CalleeFunc = nullptr;
1912
1913 // Try to find the callee in the current module.
1914 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1915 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1916
1917 // Set the "libcall callee" attribute to indicate that the function
1918 // must always have a declaration.
1919 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1920 }
1921
1922 if (isIndirectCall) {
1923 // This is indirect function call case : PTX requires a prototype of the
1924 // form
1925 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1926 // to be emitted, and the label has to be used as the last arg of the call
1927 // instruction.
1928 // The prototype is embedded in a string and put as the operand for a
1929 // CallPrototype SDNode, which will print out as the value of the string.
1930 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1931 std::string Proto = getPrototype(
1932 DL, RetTy, Args, Outs, retAlignment,
1933 HasVAArgs
1934 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1935 CLI.NumFixedArgs,
1936 cast<ConstantSDNode>(VADeclareParam->getOperand(1))
1937 ->getAPIntValue()))
1938 : std::nullopt,
1939 *CB, UniqueCallSite);
1940 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1941 SDValue ProtoOps[] = {
1942 Chain,
1943 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1944 InGlue,
1945 };
1946 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1947 InGlue = Chain.getValue(1);
1948 }
1949 // Op to just print "call"
1950 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1951 SDValue PrintCallOps[] = {
1952 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1953 };
1954 // We model convergent calls as separate opcodes.
1955 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1956 if (CLI.IsConvergent)
1957 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1958 : NVPTXISD::PrintConvergentCall;
1959 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1960 InGlue = Chain.getValue(1);
1961
1962 // Ops to print out the function name
1963 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1964 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1965 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1966 InGlue = Chain.getValue(1);
1967
1968 // Ops to print out the param list
1969 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1970 SDValue CallArgBeginOps[] = { Chain, InGlue };
1971 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1972 CallArgBeginOps);
1973 InGlue = Chain.getValue(1);
1974
1975 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
1976 ++i) {
1977 unsigned opcode;
1978 if (i == (e - 1))
1979 opcode = NVPTXISD::LastCallArg;
1980 else
1981 opcode = NVPTXISD::CallArg;
1982 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1983 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1984 DAG.getConstant(i, dl, MVT::i32), InGlue };
1985 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1986 InGlue = Chain.getValue(1);
1987 }
1988 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1989 SDValue CallArgEndOps[] = { Chain,
1990 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1991 InGlue };
1992 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1993 InGlue = Chain.getValue(1);
1994
1995 if (isIndirectCall) {
1996 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1997 SDValue PrototypeOps[] = {
1998 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
1999 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2000 InGlue = Chain.getValue(1);
2001 }
2002
2003 SmallVector<SDValue, 16> ProxyRegOps;
2004 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2005
2006 // Generate loads from param memory/moves from registers for result
2007 if (Ins.size() > 0) {
2008 SmallVector<EVT, 16> VTs;
2009 SmallVector<uint64_t, 16> Offsets;
2010 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2011 assert(VTs.size() == Ins.size() && "Bad value decomposition");
2012
2013 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
2014 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2015
2016 SmallVector<EVT, 6> LoadVTs;
2017 int VecIdx = -1; // Index of the first element of the vector.
2018
2019 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2020 // 32-bits are sign extended or zero extended, depending on whether
2021 // they are signed or unsigned types.
2022 bool ExtendIntegerRetVal =
2023 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2024
2025 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2026 bool needTruncate = false;
2027 EVT TheLoadType = VTs[i];
2028 EVT EltType = Ins[i].VT;
2029 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2030 MVT PromotedVT;
2031
2032 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2033 TheLoadType = EVT(PromotedVT);
2034 EltType = EVT(PromotedVT);
2035 needTruncate = true;
2036 }
2037
2038 if (ExtendIntegerRetVal) {
2039 TheLoadType = MVT::i32;
2040 EltType = MVT::i32;
2041 needTruncate = true;
2042 } else if (TheLoadType.getSizeInBits() < 16) {
2043 if (VTs[i].isInteger())
2044 needTruncate = true;
2045 EltType = MVT::i16;
2046 }
2047
2048 // Record index of the very first element of the vector.
2049 if (VectorInfo[i] & PVF_FIRST) {
2050 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2051 VecIdx = i;
2052 }
2053
2054 LoadVTs.push_back(EltType);
2055
2056 if (VectorInfo[i] & PVF_LAST) {
2057 unsigned NumElts = LoadVTs.size();
2058 LoadVTs.push_back(MVT::Other);
2059 LoadVTs.push_back(MVT::Glue);
2060 NVPTXISD::NodeType Op;
2061 switch (NumElts) {
2062 case 1:
2063 Op = NVPTXISD::LoadParam;
2064 break;
2065 case 2:
2066 Op = NVPTXISD::LoadParamV2;
2067 break;
2068 case 4:
2069 Op = NVPTXISD::LoadParamV4;
2070 break;
2071 default:
2072 llvm_unreachable("Invalid vector info.");
2073 }
2074
2075 SDValue LoadOperands[] = {
2076 Chain, DAG.getConstant(1, dl, MVT::i32),
2077 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2078 SDValue RetVal = DAG.getMemIntrinsicNode(
2079 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2080 MachinePointerInfo(), EltAlign,
2081 MachineMemOperand::MOLoad);
2082
2083 for (unsigned j = 0; j < NumElts; ++j) {
2084 ProxyRegOps.push_back(RetVal.getValue(j));
2085
2086 if (needTruncate)
2087 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2088 else
2089 ProxyRegTruncates.push_back(std::optional<MVT>());
2090 }
2091
2092 Chain = RetVal.getValue(NumElts);
2093 InGlue = RetVal.getValue(NumElts + 1);
2094
2095 // Cleanup
2096 VecIdx = -1;
2097 LoadVTs.clear();
2098 }
2099 }
2100 }
2101
2102 Chain =
2103 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2104 InGlue = Chain.getValue(1);
2105
2106 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2107 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2108 // dangling.
2109 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2110 SDValue Ret = DAG.getNode(
2111 NVPTXISD::ProxyReg, dl,
2112 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2113 { Chain, ProxyRegOps[i], InGlue }
2114 );
2115
2116 Chain = Ret.getValue(1);
2117 InGlue = Ret.getValue(2);
2118
2119 if (ProxyRegTruncates[i]) {
2120 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2121 }
2122
2123 InVals.push_back(Ret);
2124 }
2125
2126 // set isTailCall to false for now, until we figure out how to express
2127 // tail call optimization in PTX
2128 isTailCall = false;
2129 return Chain;
2130}
2131
2132// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2133// (see LegalizeDAG.cpp). This is slow and uses local memory.
2134 // We use extract/insert/build vector just as LegalizeOp() does in llvm 2.5.
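// For example, concatenating two <2 x float> operands becomes four
// EXTRACT_VECTOR_ELT nodes feeding a single <4 x float> BUILD_VECTOR.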
2135SDValue
2136NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2137 SDNode *Node = Op.getNode();
2138 SDLoc dl(Node);
2139 SmallVector<SDValue, 8> Ops;
2140 unsigned NumOperands = Node->getNumOperands();
2141 for (unsigned i = 0; i < NumOperands; ++i) {
2142 SDValue SubOp = Node->getOperand(i);
2143 EVT VVT = SubOp.getNode()->getValueType(0);
2144 EVT EltVT = VVT.getVectorElementType();
2145 unsigned NumSubElem = VVT.getVectorNumElements();
2146 for (unsigned j = 0; j < NumSubElem; ++j) {
2147 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2148 DAG.getIntPtrConstant(j, dl)));
2149 }
2150 }
2151 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2152}
2153
2154// We can init constant f16x2 with a single .b32 move. Normally it
2155// would get lowered as two constant loads and vector-packing move.
2156// mov.b16 %h1, 0x4000;
2157// mov.b16 %h2, 0x3C00;
2158// mov.b32 %hh2, {%h2, %h1};
2159// Instead we want just a constant move:
2160// mov.b32 %hh2, 0x40003C00
2161//
2162// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
2163// generates good SASS in both cases.
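// The 32-bit immediate is assembled below as (E1 << 16) | E0, i.e. element 0
// of the vector occupies the low 16 bits of the move.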
2164SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2165 SelectionDAG &DAG) const {
2166 EVT VT = Op->getValueType(0);
2167 if (!(Isv2x16VT(VT)))
2168 return Op;
2169 APInt E0;
2170 APInt E1;
2171 if (VT == MVT::v2f16 || VT == MVT::v2bf16) {
2172 if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) &&
2173 isa<ConstantFPSDNode>(Op->getOperand(1))))
2174 return Op;
2175
2176 E0 = cast<ConstantFPSDNode>(Op->getOperand(0))
2177 ->getValueAPF()
2178 .bitcastToAPInt();
2179 E1 = cast<ConstantFPSDNode>(Op->getOperand(1))
2180 ->getValueAPF()
2181 .bitcastToAPInt();
2182 } else {
2183 assert(VT == MVT::v2i16);
2184 if (!(isa<ConstantSDNode>(Op->getOperand(0)) &&
2185 isa<ConstantSDNode>(Op->getOperand(1))))
2186 return Op;
2187
2188 E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue();
2189 E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue();
2190 }
2191 SDValue Const =
2192 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
2193 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2194}
2195
2196SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2197 SelectionDAG &DAG) const {
2198 SDValue Index = Op->getOperand(1);
2199 // Constant index will be matched by tablegen.
2200 if (isa<ConstantSDNode>(Index.getNode()))
2201 return Op;
2202
2203 // Extract individual elements and select one of them.
2204 SDValue Vector = Op->getOperand(0);
2205 EVT VectorVT = Vector.getValueType();
2206 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2207 EVT EltVT = VectorVT.getVectorElementType();
2208
2209 SDLoc dl(Op.getNode());
2210 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2211 DAG.getIntPtrConstant(0, dl));
2212 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2213 DAG.getIntPtrConstant(1, dl));
2214 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2215 ISD::CondCode::SETEQ);
2216}
2217
2218/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2219/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2220/// amount, or
2221/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2222/// amount.
2223SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2224 SelectionDAG &DAG) const {
2225 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2226 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2227
2228 EVT VT = Op.getValueType();
2229 unsigned VTBits = VT.getSizeInBits();
2230 SDLoc dl(Op);
2231 SDValue ShOpLo = Op.getOperand(0);
2232 SDValue ShOpHi = Op.getOperand(1);
2233 SDValue ShAmt = Op.getOperand(2);
2234 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2235
2236 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2237 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2238 // {dHi, dLo} = {aHi, aLo} >> Amt
2239 // dHi = aHi >> Amt
2240 // dLo = shf.r.clamp aLo, aHi, Amt
2241
2242 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2243 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2244 ShAmt);
2245
2246 SDValue Ops[2] = { Lo, Hi };
2247 return DAG.getMergeValues(Ops, dl);
2248 }
2249 else {
2250 // {dHi, dLo} = {aHi, aLo} >> Amt
2251 // - if (Amt>=size) then
2252 // dLo = aHi >> (Amt-size)
2253 // dHi = aHi >> Amt (this is either all 0 or all 1)
2254 // else
2255 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2256 // dHi = aHi >> Amt
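// Worked example (informal), with size = 32 and Amt = 8:
// dLo = (aLo >> 8) | (aHi << 24), dHi = aHi >> 8.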
2257
2258 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2259 DAG.getConstant(VTBits, dl, MVT::i32),
2260 ShAmt);
2261 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2262 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2263 DAG.getConstant(VTBits, dl, MVT::i32));
2264 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2265 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2266 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2267
2268 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2269 DAG.getConstant(VTBits, dl, MVT::i32),
2270 ISD::SETGE);
2271 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2272 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2273
2274 SDValue Ops[2] = { Lo, Hi };
2275 return DAG.getMergeValues(Ops, dl);
2276 }
2277}
2278
2279/// LowerShiftLeftParts - Lower SHL_PARTS, which
2280/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2281/// amount, or
2282/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2283/// amount.
2284SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2285 SelectionDAG &DAG) const {
2286 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2287 assert(Op.getOpcode() == ISD::SHL_PARTS);
2288
2289 EVT VT = Op.getValueType();
2290 unsigned VTBits = VT.getSizeInBits();
2291 SDLoc dl(Op);
2292 SDValue ShOpLo = Op.getOperand(0);
2293 SDValue ShOpHi = Op.getOperand(1);
2294 SDValue ShAmt = Op.getOperand(2);
2295
2296 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2297 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2298 // {dHi, dLo} = {aHi, aLo} << Amt
2299 // dHi = shf.l.clamp aLo, aHi, Amt
2300 // dLo = aLo << Amt
2301
2302 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2303 ShAmt);
2304 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2305
2306 SDValue Ops[2] = { Lo, Hi };
2307 return DAG.getMergeValues(Ops, dl);
2308 }
2309 else {
2310 // {dHi, dLo} = {aHi, aLo} << Amt
2311 // - if (Amt>=size) then
2312 // dLo = aLo << Amt (all 0)
2313 // dHi = aLo << (Amt-size)
2314 // else
2315 // dLo = aLo << Amt
2316 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
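// Worked example (informal), with size = 32 and Amt = 8:
// dLo = aLo << 8, dHi = (aHi << 8) | (aLo >> 24).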
2317
2318 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2319 DAG.getConstant(VTBits, dl, MVT::i32),
2320 ShAmt);
2321 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2322 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2323 DAG.getConstant(VTBits, dl, MVT::i32));
2324 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2325 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2326 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2327
2328 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2329 DAG.getConstant(VTBits, dl, MVT::i32),
2330 ISD::SETGE);
2331 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2332 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2333
2334 SDValue Ops[2] = { Lo, Hi };
2335 return DAG.getMergeValues(Ops, dl);
2336 }
2337}
2338
2339SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2340 EVT VT = Op.getValueType();
2341
2342 if (VT == MVT::f32)
2343 return LowerFROUND32(Op, DAG);
2344
2345 if (VT == MVT::f64)
2346 return LowerFROUND64(Op, DAG);
2347
2348 llvm_unreachable("unhandled type");
2349}
2350
2351 // This is the rounding method used in CUDA libdevice, in C-like code:
2352// float roundf(float A)
2353// {
2354// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2355// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2356// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2357// }
2358SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2359 SelectionDAG &DAG) const {
2360 SDLoc SL(Op);
2361 SDValue A = Op.getOperand(0);
2362 EVT VT = Op.getValueType();
2363
2364 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2365
2366 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2367 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2368 const int SignBitMask = 0x80000000;
2369 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2370 DAG.getConstant(SignBitMask, SL, MVT::i32));
2371 const int PointFiveInBits = 0x3F000000;
2372 SDValue PointFiveWithSignRaw =
2373 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2374 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2375 SDValue PointFiveWithSign =
2376 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2377 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2378 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2379
2380 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2381 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2382 SDValue IsLarge =
2383 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2384 ISD::SETOGT);
2385 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2386
2387 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2388 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2389 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2390 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2391 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2392}
2393
2394// The implementation of round(double) is similar to that of round(float) in
2395// that they both separate the value range into three regions and use a method
2396// specific to the region to round the values. However, round(double) first
2397// calculates the round of the absolute value and then adds the sign back while
2398// round(float) directly rounds the value with sign.
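// For example, round(-2.5) first computes round(abs(-2.5)) = 3.0 and then
// copies the sign back, giving -3.0.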
2399SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2400 SelectionDAG &DAG) const {
2401 SDLoc SL(Op);
2402 SDValue A = Op.getOperand(0);
2403 EVT VT = Op.getValueType();
2404
2405 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2406
2407 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2408 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2409 DAG.getConstantFP(0.5, SL, VT));
2410 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2411
2412 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2413 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2414 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2415 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2416 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2417 DAG.getConstantFP(0, SL, VT),
2418 RoundedA);
2419
2420 // Add sign to rounded_A
2421 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2422 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2423
2424 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2425 SDValue IsLarge =
2426 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2427 ISD::SETOGT);
2428 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2429}
2430
2431 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2432 SDLoc DL(Op);
2433 if (Op.getValueType() != MVT::v2i16)
2434 return Op;
2435 EVT EltVT = Op.getValueType().getVectorElementType();
2436 SmallVector<SDValue> VecElements;
2437 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2438 SmallVector<SDValue> ScalarArgs;
2439 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2440 [&](const SDUse &O) {
2441 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2442 O.get(), DAG.getIntPtrConstant(I, DL));
2443 });
2444 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2445 }
2446 SDValue V =
2447 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2448 return V;
2449}
2450
2451SDValue
2452 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2453 switch (Op.getOpcode()) {
2454 case ISD::RETURNADDR:
2455 return SDValue();
2456 case ISD::FRAMEADDR:
2457 return SDValue();
2458 case ISD::GlobalAddress:
2459 return LowerGlobalAddress(Op, DAG);
2460 case ISD::INTRINSIC_W_CHAIN:
2461 return Op;
2462 case ISD::BUILD_VECTOR:
2463 return LowerBUILD_VECTOR(Op, DAG);
2464 case ISD::EXTRACT_SUBVECTOR:
2465 return Op;
2466 case ISD::EXTRACT_VECTOR_ELT:
2467 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2468 case ISD::CONCAT_VECTORS:
2469 return LowerCONCAT_VECTORS(Op, DAG);
2470 case ISD::STORE:
2471 return LowerSTORE(Op, DAG);
2472 case ISD::LOAD:
2473 return LowerLOAD(Op, DAG);
2474 case ISD::SHL_PARTS:
2475 return LowerShiftLeftParts(Op, DAG);
2476 case ISD::SRA_PARTS:
2477 case ISD::SRL_PARTS:
2478 return LowerShiftRightParts(Op, DAG);
2479 case ISD::SELECT:
2480 return LowerSelect(Op, DAG);
2481 case ISD::FROUND:
2482 return LowerFROUND(Op, DAG);
2483 case ISD::VAARG:
2484 return LowerVAARG(Op, DAG);
2485 case ISD::VASTART:
2486 return LowerVASTART(Op, DAG);
2487 case ISD::ABS:
2488 case ISD::SMIN:
2489 case ISD::SMAX:
2490 case ISD::UMIN:
2491 case ISD::UMAX:
2492 case ISD::ADD:
2493 case ISD::SUB:
2494 case ISD::MUL:
2495 case ISD::SHL:
2496 case ISD::SREM:
2497 case ISD::UREM:
2498 return LowerVectorArith(Op, DAG);
2499 default:
2500 llvm_unreachable("Custom lowering not defined for operation");
2501 }
2502}
2503
2504 // This function is almost a copy of SelectionDAG::expandVAArg().
2505 // The only difference is that this one produces loads from the local address space.
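// When the requested alignment exceeds the minimum stack argument alignment,
// the va pointer is first rounded up, roughly VAList = (VAList + align - 1) & -align,
// before the value is loaded.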
2506SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2507 const TargetLowering *TLI = STI.getTargetLowering();
2508 SDLoc DL(Op);
2509
2510 SDNode *Node = Op.getNode();
2511 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2512 EVT VT = Node->getValueType(0);
2513 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2514 SDValue Tmp1 = Node->getOperand(0);
2515 SDValue Tmp2 = Node->getOperand(1);
2516 const MaybeAlign MA(Node->getConstantOperandVal(3));
2517
2518 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2519 Tmp1, Tmp2, MachinePointerInfo(V));
2520 SDValue VAList = VAListLoad;
2521
2522 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2523 VAList = DAG.getNode(
2524 ISD::ADD, DL, VAList.getValueType(), VAList,
2525 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2526
2527 VAList = DAG.getNode(
2528 ISD::AND, DL, VAList.getValueType(), VAList,
2529 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2530 }
2531
2532 // Increment the pointer, VAList, to the next vaarg
2533 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2534 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2535 DL, VAList.getValueType()));
2536
2537 // Store the incremented VAList to the legalized pointer
2538 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2539 MachinePointerInfo(V));
2540
2541 const Value *SrcV =
2542 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2543
2544 // Load the actual argument out of the pointer VAList
2545 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2546}
2547
2548SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2549 const TargetLowering *TLI = STI.getTargetLowering();
2550 SDLoc DL(Op);
2551 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2552
2553 // Store the address of unsized array <function>_vararg[] in the ap object.
2554 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2555 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2556
2557 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2558 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2559 MachinePointerInfo(SV));
2560}
2561
2562SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2563 SDValue Op0 = Op->getOperand(0);
2564 SDValue Op1 = Op->getOperand(1);
2565 SDValue Op2 = Op->getOperand(2);
2566 SDLoc DL(Op.getNode());
2567
2568 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2569
2570 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2571 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2572 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2573 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2574
2575 return Trunc;
2576}
2577
2578SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2579 if (Op.getValueType() == MVT::i1)
2580 return LowerLOADi1(Op, DAG);
2581
2582 // v2f16/v2bf16/v2i16 are legal, so we can't rely on legalizer to handle
2583 // unaligned loads and have to handle it here.
2584 if (Isv2x16VT(Op.getValueType())) {
2585 LoadSDNode *Load = cast<LoadSDNode>(Op);
2586 EVT MemVT = Load->getMemoryVT();
2587 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2588 MemVT, *Load->getMemOperand())) {
2589 SDValue Ops[2];
2590 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2591 return DAG.getMergeValues(Ops, SDLoc(Op));
2592 }
2593 }
2594
2595 return SDValue();
2596}
2597
2598// v = ld i1* addr
2599// =>
2600// v1 = ld i8* addr (-> i16)
2601// v = trunc i16 to i1
2602SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2603 SDNode *Node = Op.getNode();
2604 LoadSDNode *LD = cast<LoadSDNode>(Node);
2605 SDLoc dl(Node);
2606 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2607 assert(Node->getValueType(0) == MVT::i1 &&
2608 "Custom lowering for i1 load only");
2609 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2610 LD->getPointerInfo(), LD->getAlign(),
2611 LD->getMemOperand()->getFlags());
2612 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2613 // The legalizer (the caller) is expecting two values from the legalized
2614 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2615 // in LegalizeDAG.cpp which also uses MergeValues.
2616 SDValue Ops[] = { result, LD->getChain() };
2617 return DAG.getMergeValues(Ops, dl);
2618}
2619
2620SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2621 StoreSDNode *Store = cast<StoreSDNode>(Op);
2622 EVT VT = Store->getMemoryVT();
2623
2624 if (VT == MVT::i1)
2625 return LowerSTOREi1(Op, DAG);
2626
2627 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2628 // stores and have to handle it here.
2629 if (Isv2x16VT(VT) &&
2630 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2631 VT, *Store->getMemOperand()))
2632 return expandUnalignedStore(Store, DAG);
2633
2634 // v2f16, v2bf16 and v2i16 don't need special handling.
2635 if (Isv2x16VT(VT))
2636 return SDValue();
2637
2638 if (VT.isVector())
2639 return LowerSTOREVector(Op, DAG);
2640
2641 return SDValue();
2642}
2643
2644SDValue
2645NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2646 SDNode *N = Op.getNode();
2647 SDValue Val = N->getOperand(1);
2648 SDLoc DL(N);
2649 EVT ValVT = Val.getValueType();
2650
2651 if (ValVT.isVector()) {
2652 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2653 // legal. We can (and should) split that into 2 stores of <2 x double> here
2654 // but I'm leaving that as a TODO for now.
2655 if (!ValVT.isSimple())
2656 return SDValue();
2657 switch (ValVT.getSimpleVT().SimpleTy) {
2658 default:
2659 return SDValue();
2660 case MVT::v2i8:
2661 case MVT::v2i16:
2662 case MVT::v2i32:
2663 case MVT::v2i64:
2664 case MVT::v2f16:
2665 case MVT::v2bf16:
2666 case MVT::v2f32:
2667 case MVT::v2f64:
2668 case MVT::v4i8:
2669 case MVT::v4i16:
2670 case MVT::v4i32:
2671 case MVT::v4f16:
2672 case MVT::v4bf16:
2673 case MVT::v4f32:
2674 case MVT::v8f16: // <4 x f16x2>
2675 case MVT::v8bf16: // <4 x bf16x2>
2676 case MVT::v8i16: // <4 x i16x2>
2677 // This is a "native" vector type
2678 break;
2679 }
2680
2681 MemSDNode *MemSD = cast<MemSDNode>(N);
2682 const DataLayout &TD = DAG.getDataLayout();
2683
2684 Align Alignment = MemSD->getAlign();
2685 Align PrefAlign =
2686 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2687 if (Alignment < PrefAlign) {
2688 // This store is not sufficiently aligned, so bail out and let this vector
2689 // store be scalarized. Note that we may still be able to emit smaller
2690 // vector stores. For example, if we are storing a <4 x float> with an
2691 // alignment of 8, this check will fail but the legalizer will try again
2692 // with 2 x <2 x float>, which will succeed with an alignment of 8.
2693 return SDValue();
2694 }
2695
2696 unsigned Opcode = 0;
2697 EVT EltVT = ValVT.getVectorElementType();
2698 unsigned NumElts = ValVT.getVectorNumElements();
2699
2700 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2701 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2702 // stored type to i16 and propagate the "real" type as the memory type.
2703 bool NeedExt = false;
2704 if (EltVT.getSizeInBits() < 16)
2705 NeedExt = true;
2706
2707 bool StoreF16x2 = false;
2708 switch (NumElts) {
2709 default:
2710 return SDValue();
2711 case 2:
2712 Opcode = NVPTXISD::StoreV2;
2713 break;
2714 case 4:
2715 Opcode = NVPTXISD::StoreV4;
2716 break;
2717 case 8:
2718 // v8f16 is a special case. PTX doesn't have st.v8.f16
2719 // instruction. Instead, we split the vector into v2f16 chunks and
2720 // store them with st.v4.b32.
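// For example, a <8 x half> store is emitted as a single StoreV4 whose four
// operands are <2 x half> pairs built from consecutive elements.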
2721 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
2722 Opcode = NVPTXISD::StoreV4;
2723 StoreF16x2 = true;
2724 break;
2725 }
2726
2727 SmallVector<SDValue, 8> Ops;
2728
2729 // First is the chain
2730 Ops.push_back(N->getOperand(0));
2731
2732 if (StoreF16x2) {
2733 // Combine f16,f16 -> v2f16
2734 NumElts /= 2;
2735 for (unsigned i = 0; i < NumElts; ++i) {
2736 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2737 DAG.getIntPtrConstant(i * 2, DL));
2738 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2739 DAG.getIntPtrConstant(i * 2 + 1, DL));
2740 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
2741 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
2742 Ops.push_back(V2);
2743 }
2744 } else {
2745 // Then the split values
2746 for (unsigned i = 0; i < NumElts; ++i) {
2747 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2748 DAG.getIntPtrConstant(i, DL));
2749 if (NeedExt)
2750 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2751 Ops.push_back(ExtVal);
2752 }
2753 }
2754
2755 // Then any remaining arguments
2756 Ops.append(N->op_begin() + 2, N->op_end());
2757
2758 SDValue NewSt =
2759 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2760 MemSD->getMemoryVT(), MemSD->getMemOperand());
2761
2762 // return DCI.CombineTo(N, NewSt, true);
2763 return NewSt;
2764 }
2765
2766 return SDValue();
2767}
2768
2769// st i1 v, addr
2770// =>
2771// v1 = zxt v to i16
2772// st.u8 i16, addr
2773SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2774 SDNode *Node = Op.getNode();
2775 SDLoc dl(Node);
2776 StoreSDNode *ST = cast<StoreSDNode>(Node);
2777 SDValue Tmp1 = ST->getChain();
2778 SDValue Tmp2 = ST->getBasePtr();
2779 SDValue Tmp3 = ST->getValue();
2780 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2781 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2782 SDValue Result =
2783 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2784 ST->getAlign(), ST->getMemOperand()->getFlags());
2785 return Result;
2786}
2787
2788 // This creates a target external symbol for a function parameter.
2789 // The name of the symbol is composed from the parameter's index and the function name.
2790 // A negative index corresponds to the special parameter (unsized array) used for
2791 // passing variable arguments.
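// For parameter index N of function foo this typically yields a symbol such as
// foo_param_N; the negative (vararg) index instead maps to the unsized
// foo_vararg array mentioned in LowerVASTART below.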
2792SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
2793 EVT v) const {
2794 StringRef SavedStr = nvTM->getStrPool().save(
2796 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
2797}
2798
2799 SDValue NVPTXTargetLowering::LowerFormalArguments(
2800 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2801 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2802 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2803 MachineFunction &MF = DAG.getMachineFunction();
2804 const DataLayout &DL = DAG.getDataLayout();
2805 auto PtrVT = getPointerTy(DAG.getDataLayout());
2806
2807 const Function *F = &MF.getFunction();
2808 const AttributeList &PAL = F->getAttributes();
2809 const TargetLowering *TLI = STI.getTargetLowering();
2810
2811 SDValue Root = DAG.getRoot();
2812 std::vector<SDValue> OutChains;
2813
2814 bool isABI = (STI.getSmVersion() >= 20);
2815 assert(isABI && "Non-ABI compilation is not supported");
2816 if (!isABI)
2817 return Chain;
2818
2819 std::vector<Type *> argTypes;
2820 std::vector<const Argument *> theArgs;
2821 for (const Argument &I : F->args()) {
2822 theArgs.push_back(&I);
2823 argTypes.push_back(I.getType());
2824 }
2825 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2826 // Ins.size() will be larger
2827 // * if there is an aggregate argument with multiple fields (each field
2828 // showing up separately in Ins)
2829 // * if there is a vector argument with more than typical vector-length
2830 // elements (generally if more than 4) where each vector element is
2831 // individually present in Ins.
2832 // So a different index should be used for indexing into Ins.
2833 // See similar issue in LowerCall.
2834 unsigned InsIdx = 0;
2835
2836 int idx = 0;
2837 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2838 Type *Ty = argTypes[i];
2839
2840 if (theArgs[i]->use_empty()) {
2841 // argument is dead
2842 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
2843 SmallVector<EVT, 16> vtparts;
2844
2845 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2846 if (vtparts.empty())
2847 report_fatal_error("Empty parameter types are not supported");
2848
2849 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2850 ++parti) {
2851 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2852 ++InsIdx;
2853 }
2854 if (vtparts.size() > 0)
2855 --InsIdx;
2856 continue;
2857 }
2858 if (Ty->isVectorTy()) {
2859 EVT ObjectVT = getValueType(DL, Ty);
2860 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2861 for (unsigned parti = 0; parti < NumRegs; ++parti) {
2862 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2863 ++InsIdx;
2864 }
2865 if (NumRegs > 0)
2866 --InsIdx;
2867 continue;
2868 }
2869 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2870 continue;
2871 }
2872
2873 // In the following cases, assign a node order of "idx+1"
2874 // to newly created nodes. The SDNodes for params have to
2875 // appear in the same order as their order of appearance
2876 // in the original function. "idx+1" holds that order.
2877 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
2878 bool aggregateIsPacked = false;
2879 if (StructType *STy = dyn_cast<StructType>(Ty))
2880 aggregateIsPacked = STy->isPacked();
2881
2882 SmallVector<EVT, 16> VTs;
2883 SmallVector<uint64_t, 16> Offsets;
2884 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2885 if (VTs.empty())
2886 report_fatal_error("Empty parameter types are not supported");
2887
2888 auto VectorInfo =
2889 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
2890
2891 SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2892 int VecIdx = -1; // Index of the first element of the current vector.
2893 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2894 if (VectorInfo[parti] & PVF_FIRST) {
2895 assert(VecIdx == -1 && "Orphaned vector.");
2896 VecIdx = parti;
2897 }
2898
2899 // That's the last element of this store op.
2900 if (VectorInfo[parti] & PVF_LAST) {
2901 unsigned NumElts = parti - VecIdx + 1;
2902 EVT EltVT = VTs[parti];
2903 // i1 is loaded/stored as i8.
2904 EVT LoadVT = EltVT;
2905 if (EltVT == MVT::i1)
2906 LoadVT = MVT::i8;
2907 else if (Isv2x16VT(EltVT))
2908 // getLoad needs a vector type, but it can't handle
2909 // vectors which contain v2f16 or v2bf16 elements. So we must load
2910 // using i32 here and then bitcast back.
2911 LoadVT = MVT::i32;
2912
2913 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2914 SDValue VecAddr =
2915 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2916 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2917 Value *srcValue = Constant::getNullValue(PointerType::get(
2918 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2919 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
2920 MachinePointerInfo(srcValue),
2921 MaybeAlign(aggregateIsPacked ? 1 : 0),
2922 MachineMemOperand::MODereferenceable |
2923 MachineMemOperand::MOInvariant);
2924 if (P.getNode())
2925 P.getNode()->setIROrder(idx + 1);
2926 for (unsigned j = 0; j < NumElts; ++j) {
2927 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2928 DAG.getIntPtrConstant(j, dl));
2929 // We've loaded i1 as an i8 and now must truncate it back to i1
2930 if (EltVT == MVT::i1)
2931 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2932 // v2f16 was loaded as an i32. Now we must bitcast it back.
2933 else if (Isv2x16VT(EltVT))
2934 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
2935
2936 // If a promoted integer type is used, truncate down to the original
2937 MVT PromotedVT;
2938 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
2939 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
2940 }
2941
2942 // Extend the element if necessary (e.g. an i8 is loaded
2943 // into an i16 register)
2944 if (Ins[InsIdx].VT.isInteger() &&
2945 Ins[InsIdx].VT.getFixedSizeInBits() >
2946 LoadVT.getFixedSizeInBits()) {
2947 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2948 : ISD::ZERO_EXTEND;
2949 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2950 }
2951 InVals.push_back(Elt);
2952 }
2953
2954 // Reset vector tracking state.
2955 VecIdx = -1;
2956 }
2957 ++InsIdx;
2958 }
2959 if (VTs.size() > 0)
2960 --InsIdx;
2961 continue;
2962 }
2963
2964 // Param has ByVal attribute
2965 // Return MoveParam(param symbol).
2966 // Ideally, the param symbol can be returned directly,
2967 // but when SDNode builder decides to use it in a CopyToReg(),
2968 // machine instruction fails because TargetExternalSymbol
2969 // (not lowered) is target dependent, and CopyToReg assumes
2970 // the source is lowered.
2971 EVT ObjectVT = getValueType(DL, Ty);
2972 assert(ObjectVT == Ins[InsIdx].VT &&
2973 "Ins type did not match function type");
2974 SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2975 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2976 if (p.getNode())
2977 p.getNode()->setIROrder(idx + 1);
2978 InVals.push_back(p);
2979 }
2980
2981 if (!OutChains.empty())
2982 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2983
2984 return Chain;
2985}
2986
2987 SDValue
2988 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2989 bool isVarArg,
2990 const SmallVectorImpl<ISD::OutputArg> &Outs,
2991 const SmallVectorImpl<SDValue> &OutVals,
2992 const SDLoc &dl, SelectionDAG &DAG) const {
2993 const MachineFunction &MF = DAG.getMachineFunction();
2994 const Function &F = MF.getFunction();
2995 Type *RetTy = MF.getFunction().getReturnType();
2996
2997 bool isABI = (STI.getSmVersion() >= 20);
2998 assert(isABI && "Non-ABI compilation is not supported");
2999 if (!isABI)
3000 return Chain;
3001
3002 const DataLayout &DL = DAG.getDataLayout();
3003 SmallVector<SDValue, 16> PromotedOutVals;
3004 SmallVector<EVT, 16> VTs;
3005 SmallVector<uint64_t, 16> Offsets;
3006 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3007 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3008
3009 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3010 SDValue PromotedOutVal = OutVals[i];
3011 MVT PromotedVT;
3012 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3013 VTs[i] = EVT(PromotedVT);
3014 }
3015 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3016 llvm::ISD::NodeType Ext =
3017 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3018 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3019 }
3020 PromotedOutVals.push_back(PromotedOutVal);
3021 }
3022
3023 auto VectorInfo = VectorizePTXValueVTs(
3024 VTs, Offsets,
3025 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3026 : Align(1));
3027
3028 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3029 // 32-bits are sign extended or zero extended, depending on whether
3030 // they are signed or unsigned types.
3031 bool ExtendIntegerRetVal =
3032 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3033
3034 SmallVector<SDValue, 6> StoreOperands;
3035 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3036 // New load/store. Record chain and offset operands.
3037 if (VectorInfo[i] & PVF_FIRST) {
3038 assert(StoreOperands.empty() && "Orphaned operand list.");
3039 StoreOperands.push_back(Chain);
3040 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3041 }
3042
3043 SDValue OutVal = OutVals[i];
3044 SDValue RetVal = PromotedOutVals[i];
3045
3046 if (ExtendIntegerRetVal) {
3047 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3049 dl, MVT::i32, RetVal);
3050 } else if (OutVal.getValueSizeInBits() < 16) {
3051 // Use 16-bit registers for small load-stores as it's the
3052 // smallest general purpose register size supported by NVPTX.
3053 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3054 }
3055
3056 // Record the value to return.
3057 StoreOperands.push_back(RetVal);
3058
3059 // That's the last element of this store op.
3060 if (VectorInfo[i] & PVF_LAST) {
3061 NVPTXISD::NodeType Op;
3062 unsigned NumElts = StoreOperands.size() - 2;
3063 switch (NumElts) {
3064 case 1:
3065 Op = NVPTXISD::StoreRetval;
3066 break;
3067 case 2:
3068 Op = NVPTXISD::StoreRetvalV2;
3069 break;
3070 case 4:
3071 Op = NVPTXISD::StoreRetvalV4;
3072 break;
3073 default:
3074 llvm_unreachable("Invalid vector info.");
3075 }
3076
3077 // Adjust type of load/store op if we've extended the scalar
3078 // return value.
3079 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3080 Chain = DAG.getMemIntrinsicNode(
3081 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3082 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3083 // Cleanup vector state.
3084 StoreOperands.clear();
3085 }
3086 }
3087
3088 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3089}
3090
3091 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3092 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
3093 SelectionDAG &DAG) const {
3094 if (Constraint.length() > 1)
3095 return;
3096 else
3097 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3098}
3099
3100static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3101 switch (Intrinsic) {
3102 default:
3103 return 0;
3104
3105 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3107 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3109 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3111 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3113 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3114 return NVPTXISD::Tex1DS32S32;
3115 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3117 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3119 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3121 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3122 return NVPTXISD::Tex1DU32S32;
3123 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3125 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3127 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3129
3130 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3132 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3134 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3136 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3138 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3140 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3142 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3144 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3146 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3148 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3150 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3152 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3154
3155 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3157 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3159 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3161 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3163 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3164 return NVPTXISD::Tex2DS32S32;
3165 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3167 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3169 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3171 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3172 return NVPTXISD::Tex2DU32S32;
3173 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3175 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3177 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3179
3180 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3182 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3184 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3186 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3188 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3190 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3192 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3194 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3196 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3198 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3200 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3202 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3204
3205 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3207 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3209 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3211 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3213 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3214 return NVPTXISD::Tex3DS32S32;
3215 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3217 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3219 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3221 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3222 return NVPTXISD::Tex3DU32S32;
3223 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3225 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3227 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3229
3230 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3232 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3234 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3236 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3238 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3240 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3242
3243 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3245 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3247 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3249 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3251 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3253 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3255
3256 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3258 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3260 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3262 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3264 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3266 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3268 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3270 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3272 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3274 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3276 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3278 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3280
3281 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3283 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3285 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3287 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3289 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3291 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3293 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3295 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3297 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3299 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3301 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3303 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3305
3306 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3308 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3310 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3312 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3314 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3316 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3318 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3320 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3322 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3324 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3326 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3328 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3330
3331 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3333 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3335 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3337 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3339 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3341 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3343 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3345 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3347 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3349 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3351 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3353 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3355
3356 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3358 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3360 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3362 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3364 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3366 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3368 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3370 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3372 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3374 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3376 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3378 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3380
3381 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3383 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3385 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3387 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3389 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3391 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3393 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3395 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3397 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3399 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3401 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3403 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3405
3406 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3408 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3410 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3412 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3414 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3416 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3418
3419 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3421 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3423 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3425 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3427 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3429 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3431
3432 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3434 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3436 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3438 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3440 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3442 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3444 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3446 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3448 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3450 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3452 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3454 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3456 }
3457}
3458
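// Map an NVVM surface-load (suld.*) intrinsic ID to the NVPTX-specific
// opcode used to lower it; unhandled intrinsics map to 0.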
3459static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3460 switch (Intrinsic) {
3461 default:
3462 return 0;
3463 case Intrinsic::nvvm_suld_1d_i8_clamp:
3465 case Intrinsic::nvvm_suld_1d_i16_clamp:
3467 case Intrinsic::nvvm_suld_1d_i32_clamp:
3469 case Intrinsic::nvvm_suld_1d_i64_clamp:
3471 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3473 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3475 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3477 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3479 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3481 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3483 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3485 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3487 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3489 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3491 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3493 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3495 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3497 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3499 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3501 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3503 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3505 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3507 case Intrinsic::nvvm_suld_2d_i8_clamp:
3509 case Intrinsic::nvvm_suld_2d_i16_clamp:
3511 case Intrinsic::nvvm_suld_2d_i32_clamp:
3513 case Intrinsic::nvvm_suld_2d_i64_clamp:
3515 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3517 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3519 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3521 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3523 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3525 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3527 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3529 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3531 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3533 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3535 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3537 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3539 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3541 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3543 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3545 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3547 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3549 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3551 case Intrinsic::nvvm_suld_3d_i8_clamp:
3553 case Intrinsic::nvvm_suld_3d_i16_clamp:
3555 case Intrinsic::nvvm_suld_3d_i32_clamp:
3557 case Intrinsic::nvvm_suld_3d_i64_clamp:
3559 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3561 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3563 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3565 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3567 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3569 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3571 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3573 case Intrinsic::nvvm_suld_1d_i8_trap:
3575 case Intrinsic::nvvm_suld_1d_i16_trap:
3577 case Intrinsic::nvvm_suld_1d_i32_trap:
3579 case Intrinsic::nvvm_suld_1d_i64_trap:
3581 case Intrinsic::nvvm_suld_1d_v2i8_trap:
3583 case Intrinsic::nvvm_suld_1d_v2i16_trap:
3585 case Intrinsic::nvvm_suld_1d_v2i32_trap:
3587 case Intrinsic::nvvm_suld_1d_v2i64_trap:
3589 case Intrinsic::nvvm_suld_1d_v4i8_trap:
3591 case Intrinsic::nvvm_suld_1d_v4i16_trap:
3593 case Intrinsic::nvvm_suld_1d_v4i32_trap:
3595 case Intrinsic::nvvm_suld_1d_array_i8_trap:
3597 case Intrinsic::nvvm_suld_1d_array_i16_trap:
3599 case Intrinsic::nvvm_suld_1d_array_i32_trap:
3601 case Intrinsic::nvvm_suld_1d_array_i64_trap:
3603 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3605 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3607 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3609 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3611 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3613 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3615 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3617 case Intrinsic::nvvm_suld_2d_i8_trap:
3619 case Intrinsic::nvvm_suld_2d_i16_trap:
3621 case Intrinsic::nvvm_suld_2d_i32_trap:
3623 case Intrinsic::nvvm_suld_2d_i64_trap:
3625 case Intrinsic::nvvm_suld_2d_v2i8_trap:
3627 case Intrinsic::nvvm_suld_2d_v2i16_trap:
3629 case Intrinsic::nvvm_suld_2d_v2i32_trap:
3631 case Intrinsic::nvvm_suld_2d_v2i64_trap:
3633 case Intrinsic::nvvm_suld_2d_v4i8_trap:
3635 case Intrinsic::nvvm_suld_2d_v4i16_trap:
3637 case Intrinsic::nvvm_suld_2d_v4i32_trap:
3639 case Intrinsic::nvvm_suld_2d_array_i8_trap:
3641 case Intrinsic::nvvm_suld_2d_array_i16_trap:
3643 case Intrinsic::nvvm_suld_2d_array_i32_trap:
3645 case Intrinsic::nvvm_suld_2d_array_i64_trap:
3647 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3649 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3651 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3653 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3655 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3657 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3659 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3661 case Intrinsic::nvvm_suld_3d_i8_trap:
3663 case Intrinsic::nvvm_suld_3d_i16_trap:
3665 case Intrinsic::nvvm_suld_3d_i32_trap:
3667 case Intrinsic::nvvm_suld_3d_i64_trap:
3669 case Intrinsic::nvvm_suld_3d_v2i8_trap:
3671 case Intrinsic::nvvm_suld_3d_v2i16_trap:
3673 case Intrinsic::nvvm_suld_3d_v2i32_trap:
3675 case Intrinsic::nvvm_suld_3d_v2i64_trap:
3677 case Intrinsic::nvvm_suld_3d_v4i8_trap:
3679 case Intrinsic::nvvm_suld_3d_v4i16_trap:
3681 case Intrinsic::nvvm_suld_3d_v4i32_trap:
3683 case Intrinsic::nvvm_suld_1d_i8_zero:
3685 case Intrinsic::nvvm_suld_1d_i16_zero:
3687 case Intrinsic::nvvm_suld_1d_i32_zero:
3689 case Intrinsic::nvvm_suld_1d_i64_zero:
3691 case Intrinsic::nvvm_suld_1d_v2i8_zero:
3693 case Intrinsic::nvvm_suld_1d_v2i16_zero:
3695 case Intrinsic::nvvm_suld_1d_v2i32_zero:
3697 case Intrinsic::nvvm_suld_1d_v2i64_zero:
3699 case Intrinsic::nvvm_suld_1d_v4i8_zero:
3701 case Intrinsic::nvvm_suld_1d_v4i16_zero:
3703 case Intrinsic::nvvm_suld_1d_v4i32_zero:
3705 case Intrinsic::nvvm_suld_1d_array_i8_zero:
3707 case Intrinsic::nvvm_suld_1d_array_i16_zero:
3709 case Intrinsic::nvvm_suld_1d_array_i32_zero:
3711 case Intrinsic::nvvm_suld_1d_array_i64_zero:
3713 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3715 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3717 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3719 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3721 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3723 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3725 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3727 case Intrinsic::nvvm_suld_2d_i8_zero:
3729 case Intrinsic::nvvm_suld_2d_i16_zero:
3731 case Intrinsic::nvvm_suld_2d_i32_zero:
3733 case Intrinsic::nvvm_suld_2d_i64_zero:
3735 case Intrinsic::nvvm_suld_2d_v2i8_zero:
3737 case Intrinsic::nvvm_suld_2d_v2i16_zero:
3739 case Intrinsic::nvvm_suld_2d_v2i32_zero:
3741 case Intrinsic::nvvm_suld_2d_v2i64_zero:
3743 case Intrinsic::nvvm_suld_2d_v4i8_zero:
3745 case Intrinsic::nvvm_suld_2d_v4i16_zero:
3747 case Intrinsic::nvvm_suld_2d_v4i32_zero:
3749 case Intrinsic::nvvm_suld_2d_array_i8_zero:
3751 case Intrinsic::nvvm_suld_2d_array_i16_zero:
3753 case Intrinsic::nvvm_suld_2d_array_i32_zero:
3755 case Intrinsic::nvvm_suld_2d_array_i64_zero:
3757 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3759 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3761 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3763 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3765 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3767 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3769 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3771 case Intrinsic::nvvm_suld_3d_i8_zero:
3773 case Intrinsic::nvvm_suld_3d_i16_zero:
3775 case Intrinsic::nvvm_suld_3d_i32_zero:
3777 case Intrinsic::nvvm_suld_3d_i64_zero:
3779 case Intrinsic::nvvm_suld_3d_v2i8_zero:
3781 case Intrinsic::nvvm_suld_3d_v2i16_zero:
3783 case Intrinsic::nvvm_suld_3d_v2i32_zero:
3785 case Intrinsic::nvvm_suld_3d_v2i64_zero:
3787 case Intrinsic::nvvm_suld_3d_v4i8_zero:
3789 case Intrinsic::nvvm_suld_3d_v4i16_zero:
3791 case Intrinsic::nvvm_suld_3d_v4i32_zero:
3793 }
3794}
3795
3796// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3797// TgtMemIntrinsic because we need information that is only available
3798// through the "Value" of the destination pointer.
3799// In particular, we need its address space, which is preserved by
3800// recording the pointer Value itself in Info.ptrVal below.
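// Each handled intrinsic below fills in the IntrinsicInfo record following
// the same general pattern (illustrative sketch only; the exact values are
// given per case):
//
//   Info.opc    = ISD::INTRINSIC_W_CHAIN;     // ISD::INTRINSIC_VOID for stores
//   Info.memVT  = <MVT describing the access>;
//   Info.ptrVal = I.getArgOperand(0);         // keeps the address space
//   Info.offset = 0;
//   Info.flags  = MachineMemOperand::MOLoad;  // and/or MOStore
//   Info.align  = <alignment, if statically known>;
//
// Returning true tells SelectionDAG to attach a MachineMemOperand with this
// information to the lowered node.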
3801bool NVPTXTargetLowering::getTgtMemIntrinsic(
3802 IntrinsicInfo &Info, const CallInst &I,
3803 MachineFunction &MF, unsigned Intrinsic) const {
3804 switch (Intrinsic) {
3805 default:
3806 return false;
3807 case Intrinsic::nvvm_match_all_sync_i32p:
3808 case Intrinsic::nvvm_match_all_sync_i64p:
3809 Info.opc = ISD::INTRINSIC_W_CHAIN;
3810 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3811 // in order to model data exchange with other threads, but perform no real
3812 // memory accesses.
3813 Info.memVT = MVT::i1;
3814
3815 // Our result depends on both our own and other threads' arguments.
3816 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3817 return true;
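// The WMMA and ldmatrix loads below do perform real memory accesses; each
// case group records the per-thread fragment type (memVT) it reads and the
// alignment that can be assumed for it.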
3818 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3819 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3820 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3821 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3822 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3823 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3824 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3825 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3826 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3827 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3828 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3829 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3830 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3831 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3832 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3833 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3834 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3835 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3836 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3837 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3838 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3839 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3840 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3841 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3842 Info.opc = ISD::INTRINSIC_W_CHAIN;
3843 Info.memVT = MVT::v8f16;
3844 Info.ptrVal = I.getArgOperand(0);
3845 Info.offset = 0;
3846 Info.flags = MachineMemOperand::MOLoad;
3847 Info.align = Align(16);
3848 return true;
3849 }
3850 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3851 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3852 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3853 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3854 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3855 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3856 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3857 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3858 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3859 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3860 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3861 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3862 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3863 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3864 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3865 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3866 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3867 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3868 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3869 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3870 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3871 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3872 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3873 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3874 Info.opc = ISD::INTRINSIC_W_CHAIN;
3875 Info.memVT = MVT::v2i32;
3876 Info.ptrVal = I.getArgOperand(0);
3877 Info.offset = 0;
3878 Info.flags = MachineMemOperand::MOLoad;
3879 Info.align = Align(8);
3880 return true;
3881 }
3882
3883 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3884 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3885 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3886 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3887 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3888 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3889 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3890 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3891 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3892 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3893 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3894 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3895 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3896 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3897 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3898 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3899
3900 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3901 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3902 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3903 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3904 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3905 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3906 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3907 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3908 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3909 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3910 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3911 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3912 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3913 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3914 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3915 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3916 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3917 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3918 Info.opc = ISD::INTRINSIC_W_CHAIN;
3919 Info.memVT = MVT::v4i32;
3920 Info.ptrVal = I.getArgOperand(0);
3921 Info.offset = 0;
3922 Info.flags = MachineMemOperand::MOLoad;
3923 Info.align = Align(16);
3924 return true;
3925 }
3926
3927 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3928 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3929 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3930 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3931 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3932 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3933 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3934 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3935
3936 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3937 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3938 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3939 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3940 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3941 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3942 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3943 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3944 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3945 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3946 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3947 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3948 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3949 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3950 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3951 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3952 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3953 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3954 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3955 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3956 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3957 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3958 Info.opc = ISD::INTRINSIC_W_CHAIN;
3959 Info.memVT = MVT::i32;
3960 Info.ptrVal = I.getArgOperand(0);
3961 Info.offset = 0;
3962 Info.flags = MachineMemOperand::MOLoad;
3963 Info.align = Align(4);
3964 return true;
3965 }
3966
3967 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3968 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3969 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3970 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3971 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3972 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3973 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3974 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3975 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3976 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3977 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3978 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3979 Info.opc = ISD::INTRINSIC_W_CHAIN;
3980 Info.memVT = MVT::v4f16;
3981 Info.ptrVal = I.getArgOperand(0);
3982 Info.offset = 0;
3983 Info.flags = MachineMemOperand::MOLoad;
3984 Info.align = Align(16);
3985 return true;
3986 }
3987
3988 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3989 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3990 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3991 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3992 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3993 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3994 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3995 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3996 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3997 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3998 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3999 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4000 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4001 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4002 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4003 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4004 Info.opc = ISD::INTRINSIC_W_CHAIN;
4005 Info.memVT = MVT::v8f32;
4006 Info.ptrVal = I.getArgOperand(0);
4007 Info.offset = 0;
4008 Info.flags = MachineMemOperand::MOLoad;
4009 Info.align = Align(16);
4010 return true;
4011 }
4012
4013 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4014 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4015 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4016 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4017
4018 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4019 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4020 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4021 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4022
4023 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4024 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4025 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4026 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4027 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4028 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4029 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4030 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4031 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4032 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4033 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4034 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4035 Info.opc = ISD::INTRINSIC_W_CHAIN;
4036 Info.memVT = MVT::v8i32;
4037 Info.ptrVal = I.getArgOperand(0);
4038 Info.offset = 0;
4039 Info.flags = MachineMemOperand::MOLoad;
4040 Info.align = Align(16);
4041 return true;
4042 }
4043
4044 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4045 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4046 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4047 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4048 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4049 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4050 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4051 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4052 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4053 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4054 Info.opc = ISD::INTRINSIC_W_CHAIN;
4055 Info.memVT = MVT::v2i32;
4056 Info.ptrVal = I.getArgOperand(0);
4057 Info.offset = 0;
4058 Info.flags = MachineMemOperand::MOLoad;
4059 Info.align = Align(8);
4060 return true;
4061 }
4062
4063 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4064 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4065 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4066 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4067
4068 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4069 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4070 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4071 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4072 Info.opc = ISD::INTRINSIC_W_CHAIN;
4073 Info.memVT = MVT::f64;
4074 Info.ptrVal = I.getArgOperand(0);
4075 Info.offset = 0;
4076 Info.flags = MachineMemOperand::MOLoad;
4077 Info.align = Align(8);
4078 return true;
4079 }
4080
4081 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4082 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4083 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4084 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4085 Info.opc = ISD::INTRINSIC_W_CHAIN;
4086 Info.memVT = MVT::v2f64;
4087 Info.ptrVal = I.getArgOperand(0);
4088 Info.offset = 0;
4089 Info.flags = MachineMemOperand::MOLoad;
4090 Info.align = Align(16);
4091 return true;
4092 }
4093
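// WMMA fragment stores mirror the loads above: they are modeled as
// ISD::INTRINSIC_VOID with MachineMemOperand::MOStore and use the same
// fragment types and alignments.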
4094 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4095 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4096 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4097 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4098 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4099 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4100 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4101 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4102 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4103 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4104 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4105 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4106 Info.opc = ISD::INTRINSIC_VOID;
4107 Info.memVT = MVT::v4f16;
4108 Info.ptrVal = I.getArgOperand(0);
4109 Info.offset = 0;
4110 Info.flags = MachineMemOperand::MOStore;
4111 Info.align = Align(16);
4112 return true;
4113 }
4114
4115 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4116 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4117 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4118 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4119 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4120 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4121 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4122 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4123 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4124 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4125 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4126 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4127 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4128 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4129 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4130 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4131 Info.opc = ISD::INTRINSIC_VOID;
4132 Info.memVT = MVT::v8f32;
4133 Info.ptrVal = I.getArgOperand(0);
4134 Info.offset = 0;
4135 Info.flags = MachineMemOperand::MOStore;
4136 Info.align = Align(16);
4137 return true;
4138 }
4139
4140 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4141 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4142 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4143 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4144 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4145 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4146 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4147 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4148 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4149 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4150 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4151 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4152 Info.opc = ISD::INTRINSIC_VOID;
4153 Info.memVT = MVT::v8i32;
4154 Info.ptrVal = I.getArgOperand(0);
4155 Info.offset = 0;
4156 Info.flags = MachineMemOperand::MOStore;
4157 Info.align = Align(16);
4158 return true;
4159 }
4160
4161 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4162 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4163 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4164 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4165 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4166 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4167 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4168 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4169 Info.opc = ISD::INTRINSIC_VOID;
4170 Info.memVT = MVT::v2i32;
4171 Info.ptrVal = I.getArgOperand(0);
4172 Info.offset = 0;
4173 Info.flags = MachineMemOperand::MOStore;
4174 Info.align = Align(8);
4175 return true;
4176 }
4177
4178 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4179 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4180 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4181 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4182 Info.opc = ISD::INTRINSIC_VOID;
4183 Info.memVT = MVT::v2f64;
4184 Info.ptrVal = I.getArgOperand(0);
4185 Info.offset = 0;
4186 Info.flags = MachineMemOperand::MOStore;
4187 Info.align = Align(16);
4188 return true;
4189 }
4190
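// Atomic read-modify-write intrinsics, including the cta/sys-scoped variants
// on generic-address-space pointers. The memory type is the value type of the
// intrinsic result, and no particular alignment is assumed.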
4191 case Intrinsic::nvvm_atomic_load_inc_32:
4192 case Intrinsic::nvvm_atomic_load_dec_32:
4193
4194 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4195 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4196 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4197 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4198 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4199 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4200 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4201 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4202 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4203 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4204 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4205 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4206 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4207 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4208 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4209 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4210 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4211 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4212 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4213 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4214 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4215 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4216 auto &DL = I.getModule()->getDataLayout();
4217 Info.opc = ISD::INTRINSIC_W_CHAIN;
4218 Info.memVT = getValueType(DL, I.getType());
4219 Info.ptrVal = I.getArgOperand(0);
4220 Info.offset = 0;
4221 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4222 Info.align.reset();
4223 return true;
4224 }
4225
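// ldu/ldg: read-only loads from global memory. The loaded type (or the
// pointer type for the _p variants) becomes memVT, and the explicit
// alignment operand (operand 1) is propagated to the memory operand.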
4226 case Intrinsic::nvvm_ldu_global_i:
4227 case Intrinsic::nvvm_ldu_global_f:
4228 case Intrinsic::nvvm_ldu_global_p: {
4229 auto &DL = I.getModule()->getDataLayout();
4230 Info.opc = ISD::INTRINSIC_W_CHAIN;
4231 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4232 Info.memVT = getValueType(DL, I.getType());
4233 else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4234 Info.memVT = getPointerTy(DL);
4235 else
4236 Info.memVT = getValueType(DL, I.getType());
4237 Info.ptrVal = I.getArgOperand(0);
4238 Info.offset = 0;
4239 Info.flags = MachineMemOperand::MOLoad;
4240 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4241
4242 return true;
4243 }
4244 case Intrinsic::nvvm_ldg_global_i:
4245 case Intrinsic::nvvm_ldg_global_f:
4246 case Intrinsic::nvvm_ldg_global_p: {
4247 auto &DL = I.getModule()->getDataLayout();
4248
4249 Info.opc = ISD::INTRINSIC_W_CHAIN;
4250 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4251 Info.memVT = getValueType(DL, I.getType());
4252 else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4253 Info.memVT = getPointerTy(DL);
4254 else
4255 Info.memVT = getValueType(DL, I.getType());
4256 Info.ptrVal = I.getArgOperand(0);
4257 Info.offset = 0;
4258 Info.flags = MachineMemOperand::MOLoad;
4259 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4260
4261 return true;
4262 }
4263
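// Texture fetch intrinsics; each of the cases below returns a four-element
// vector result.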
4264 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4265 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4266 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: