NVPTXISelLowering.cpp
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
35#include "llvm/IR/Argument.h"
36#include "llvm/IR/Attributes.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
41#include "llvm/IR/FPEnv.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/GlobalValue.h"
44#include "llvm/IR/Instruction.h"
46#include "llvm/IR/IntrinsicsNVPTX.h"
47#include "llvm/IR/Module.h"
48#include "llvm/IR/Type.h"
49#include "llvm/IR/Value.h"
58#include <algorithm>
59#include <cassert>
60#include <cmath>
61#include <cstdint>
62#include <iterator>
63#include <optional>
64#include <sstream>
65#include <string>
66#include <utility>
67#include <vector>
68
69#define DEBUG_TYPE "nvptx-lower"
70
71using namespace llvm;
72
73static std::atomic<unsigned> GlobalUniqueCallSite;
74
76 "nvptx-sched4reg",
77 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
78
80 "nvptx-fma-level", cl::Hidden,
81 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
82 " 1: do it 2: do it aggressively"),
83 cl::init(2));
84
86 "nvptx-prec-divf32", cl::Hidden,
87 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
88 " IEEE Compliant F32 div.rnd if available."),
89 cl::init(2));
90
92 "nvptx-prec-sqrtf32", cl::Hidden,
93 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
94 cl::init(true));
95
97 "nvptx-force-min-byval-param-align", cl::Hidden,
98 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
99 " params of device functions."),
100 cl::init(false));
101
102int NVPTXTargetLowering::getDivF32Level() const {
103 if (UsePrecDivF32.getNumOccurrences() > 0) {
104 // If nvptx-prec-divf32=N is used on the command-line, always honor it
105 return UsePrecDivF32;
106 } else {
107 // Otherwise, use div.approx if fast math is enabled
108 if (getTargetMachine().Options.UnsafeFPMath)
109 return 0;
110 else
111 return 2;
112 }
113}
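// Added note (illustrative commentary, not part of the original source): the
// level returned above selects the PTX f32 divide flavor, following the
// nvptx-prec-divf32 option description:
//   0 -> div.approx.f32  (fast, approximate)
//   1 -> div.full.f32    (full-range approximation)
//   2 -> div.rn.f32      (IEEE-compliant rounded divide, when available)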
114
115bool NVPTXTargetLowering::usePrecSqrtF32() const {
116 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
117 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
118 return UsePrecSqrtF32;
119 } else {
120 // Otherwise, use sqrt.approx if fast math is enabled
121 return !getTargetMachine().Options.UnsafeFPMath;
122 }
123}
124
125bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
126 return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
127 DenormalMode::PreserveSign;
128}
129
130static bool IsPTXVectorType(MVT VT) {
131 switch (VT.SimpleTy) {
132 default:
133 return false;
134 case MVT::v2i1:
135 case MVT::v4i1:
136 case MVT::v2i8:
137 case MVT::v4i8:
138 case MVT::v2i16:
139 case MVT::v4i16:
140 case MVT::v8i16: // <4 x i16x2>
141 case MVT::v2i32:
142 case MVT::v4i32:
143 case MVT::v2i64:
144 case MVT::v2f16:
145 case MVT::v4f16:
146 case MVT::v8f16: // <4 x f16x2>
147 case MVT::v2bf16:
148 case MVT::v4bf16:
149 case MVT::v8bf16: // <4 x bf16x2>
150 case MVT::v2f32:
151 case MVT::v4f32:
152 case MVT::v2f64:
153 return true;
154 }
155}
156
157static bool Is16bitsType(MVT VT) {
158 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
159 VT.SimpleTy == MVT::i16);
160}
161
162/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
163/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
164/// into their primitive components.
165/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
166/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
167/// LowerCall, and LowerReturn.
168static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
169 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
170 SmallVectorImpl<uint64_t> *Offsets = nullptr,
171 uint64_t StartingOffset = 0) {
172 SmallVector<EVT, 16> TempVTs;
173 SmallVector<uint64_t, 16> TempOffsets;
174
175 // Special case for i128 - decompose to (i64, i64)
176 if (Ty->isIntegerTy(128)) {
177 ValueVTs.push_back(EVT(MVT::i64));
178 ValueVTs.push_back(EVT(MVT::i64));
179
180 if (Offsets) {
181 Offsets->push_back(StartingOffset + 0);
182 Offsets->push_back(StartingOffset + 8);
183 }
184
185 return;
186 }
187
188 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
189 if (StructType *STy = dyn_cast<StructType>(Ty)) {
190 auto const *SL = DL.getStructLayout(STy);
191 auto ElementNum = 0;
192 for(auto *EI : STy->elements()) {
193 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
194 StartingOffset + SL->getElementOffset(ElementNum));
195 ++ElementNum;
196 }
197 return;
198 }
199
200 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
201 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
202 EVT VT = TempVTs[i];
203 uint64_t Off = TempOffsets[i];
204 // Split vectors into individual elements, except for v2f16, which
205 // we will pass as a single scalar.
206 if (VT.isVector()) {
207 unsigned NumElts = VT.getVectorNumElements();
208 EVT EltVT = VT.getVectorElementType();
209 // Vectors with an even number of f16 elements will be passed to
210 // us as an array of v2f16/v2bf16 elements. We must match this so we
211 // stay in sync with Ins/Outs.
212 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
213 switch (EltVT.getSimpleVT().SimpleTy) {
214 case MVT::f16:
215 EltVT = MVT::v2f16;
216 break;
217 case MVT::bf16:
218 EltVT = MVT::v2bf16;
219 break;
220 case MVT::i16:
221 EltVT = MVT::v2i16;
222 break;
223 default:
224 llvm_unreachable("Unexpected type");
225 }
226 NumElts /= 2;
227 } else if (EltVT.getSimpleVT() == MVT::i8 &&
228 (NumElts % 4 == 0 || NumElts == 3)) {
229 // v*i8 are formally lowered as v4i8
230 EltVT = MVT::v4i8;
231 NumElts = (NumElts + 3) / 4;
232 }
233 for (unsigned j = 0; j != NumElts; ++j) {
234 ValueVTs.push_back(EltVT);
235 if (Offsets)
236 Offsets->push_back(Off + j * EltVT.getStoreSize());
237 }
238 } else {
239 ValueVTs.push_back(VT);
240 if (Offsets)
241 Offsets->push_back(Off);
242 }
243 }
244}
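// Added examples (illustrative commentary, not part of the original source)
// of how ComputePTXValueVTs flattens a type under the rules above:
//   i128       -> ValueVTs = {i64, i64},     Offsets = {0, 8}
//   <4 x half> -> ValueVTs = {v2f16, v2f16}, Offsets = {0, 4}
//   <3 x i8>   -> ValueVTs = {v4i8},         Offsets = {0}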
245
246/// PromoteScalarIntegerPTX
247/// Used to make sure the arguments/returns are suitable for passing
248/// and promote them to a larger size if they're not.
249///
250/// The promoted type is placed in \p PromotedVT if the function returns true.
251static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
252 if (VT.isScalarInteger()) {
253 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
254 default:
256 "Promotion is not suitable for scalars of size larger than 64-bits");
257 case 1:
258 *PromotedVT = MVT::i1;
259 break;
260 case 2:
261 case 4:
262 case 8:
263 *PromotedVT = MVT::i8;
264 break;
265 case 16:
266 *PromotedVT = MVT::i16;
267 break;
268 case 32:
269 *PromotedVT = MVT::i32;
270 break;
271 case 64:
272 *PromotedVT = MVT::i64;
273 break;
274 }
275 return EVT(*PromotedVT) != VT;
276 }
277 return false;
278}
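// Added examples (illustrative commentary, not part of the original source):
//   i7  -> promoted to i8  (returns true)
//   i24 -> promoted to i32 (returns true)
//   i32 -> already suitable, *PromotedVT = i32 (returns false)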
279
280// Check whether we can merge loads/stores of some of the pieces of a
281// flattened function parameter or return value into a single vector
282// load/store.
283//
284// The flattened parameter is represented as a list of EVTs and
285// offsets, and the whole structure is aligned to ParamAlignment. This
286// function determines whether we can load/store pieces of the
287// parameter starting at index Idx using a single vectorized op of
288// size AccessSize. If so, it returns the number of param pieces
289// covered by the vector op. Otherwise, it returns 1.
290static unsigned CanMergeParamLoadStoresStartingAt(
291 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
292 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
293
294 // Can't vectorize if param alignment is not sufficient.
295 if (ParamAlignment < AccessSize)
296 return 1;
297 // Can't vectorize if offset is not aligned.
298 if (Offsets[Idx] & (AccessSize - 1))
299 return 1;
300
301 EVT EltVT = ValueVTs[Idx];
302 unsigned EltSize = EltVT.getStoreSize();
303
304 // Element is too large to vectorize.
305 if (EltSize >= AccessSize)
306 return 1;
307
308 unsigned NumElts = AccessSize / EltSize;
310 // Can't vectorize if AccessSize is not a multiple of EltSize.
310 if (AccessSize != EltSize * NumElts)
311 return 1;
312
313 // We don't have enough elements to vectorize.
314 if (Idx + NumElts > ValueVTs.size())
315 return 1;
316
317 // PTX ISA can only deal with 2- and 4-element vector ops.
318 if (NumElts != 4 && NumElts != 2)
319 return 1;
320
321 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
322 // Types do not match.
323 if (ValueVTs[j] != EltVT)
324 return 1;
325
326 // Elements are not contiguous.
327 if (Offsets[j] - Offsets[j - 1] != EltSize)
328 return 1;
329 }
330 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
331 return NumElts;
332}
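// Added example (illustrative commentary, not part of the original source):
// for a parameter flattened to four f32 pieces at offsets {0, 4, 8, 12} and
// aligned to 16 bytes, a query at Idx = 0 with AccessSize = 16 returns 4
// (one v4 access covers all four pieces). With only 8-byte alignment the
// same query returns 1 for AccessSize = 16, but 2 for AccessSize = 8.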
333
334// Flags for tracking per-element vectorization state of loads/stores
335// of a flattened function parameter or return value.
336enum ParamVectorizationFlags {
337 PVF_INNER = 0x0, // Middle elements of a vector.
338 PVF_FIRST = 0x1, // First element of the vector.
339 PVF_LAST = 0x2, // Last element of the vector.
340 // Scalar is effectively a 1-element vector.
341 PVF_SCALAR = PVF_FIRST | PVF_LAST
342};
343
344// Computes whether and how we can vectorize the loads/stores of a
345// flattened function parameter or return value.
346//
347// The flattened parameter is represented as the list of ValueVTs and
348// Offsets, and is aligned to ParamAlignment bytes. We return a vector
349// of the same size as ValueVTs indicating how each piece should be
350// loaded/stored (i.e. as a scalar, or as part of a vector
351// load/store).
352static SmallVector<ParamVectorizationFlags, 16>
353VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
354 const SmallVectorImpl<uint64_t> &Offsets,
355 Align ParamAlignment, bool IsVAArg = false) {
356 // Set vector size to match ValueVTs and mark all elements as
357 // scalars by default.
358 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
359 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
360
361 if (IsVAArg)
362 return VectorInfo;
363
364 // Check what we can vectorize using 128/64/32-bit accesses.
365 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
366 // Skip elements we've already processed.
367 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
368 for (unsigned AccessSize : {16, 8, 4, 2}) {
369 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
370 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
371 // Mark vectorized elements.
372 switch (NumElts) {
373 default:
374 llvm_unreachable("Unexpected return value");
375 case 1:
376 // Can't vectorize using this size, try next smaller size.
377 continue;
378 case 2:
379 assert(I + 1 < E && "Not enough elements.");
380 VectorInfo[I] = PVF_FIRST;
381 VectorInfo[I + 1] = PVF_LAST;
382 I += 1;
383 break;
384 case 4:
385 assert(I + 3 < E && "Not enough elements.");
386 VectorInfo[I] = PVF_FIRST;
387 VectorInfo[I + 1] = PVF_INNER;
388 VectorInfo[I + 2] = PVF_INNER;
389 VectorInfo[I + 3] = PVF_LAST;
390 I += 3;
391 break;
392 }
393 // Break out of the inner loop because we've already succeeded
394 // using largest possible AccessSize.
395 break;
396 }
397 }
398 return VectorInfo;
399}
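// Added example (illustrative commentary, not part of the original source):
// continuing the four-f32, 16-byte-aligned case above, the returned
// VectorInfo is {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}; with only
// 4-byte alignment every entry remains PVF_SCALAR.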
400
401// NVPTXTargetLowering Constructor.
402NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
403 const NVPTXSubtarget &STI)
404 : TargetLowering(TM), nvTM(&TM), STI(STI) {
405 // always lower memset, memcpy, and memmove intrinsics to load/store
406 // instructions, rather than generating calls to memset, memcpy, or
407 // memmove.
408 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
409 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
410 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
411
414
415 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
416 // condition branches.
417 setJumpIsExpensive(true);
418
419 // Wide divides are _very_ slow. Try to reduce the width of the divide if
420 // possible.
421 addBypassSlowDiv(64, 32);
422
423 // By default, use the Source scheduling
424 if (sched4reg)
425 setSchedulingPreference(Sched::RegPressure);
426 else
427 setSchedulingPreference(Sched::Source);
428
429 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
430 LegalizeAction NoF16Action) {
431 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
432 };
433
434 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
435 LegalizeAction NoBF16Action) {
436 bool IsOpSupported = STI.hasBF16Math();
437 // A few of these instructions are available on sm_90 only
438 switch(Op) {
439 case ISD::FADD:
440 case ISD::FMUL:
441 case ISD::FSUB:
442 case ISD::SELECT:
443 case ISD::SELECT_CC:
444 case ISD::SETCC:
445 case ISD::FEXP2:
446 case ISD::FCEIL:
447 case ISD::FFLOOR:
448 case ISD::FNEARBYINT:
449 case ISD::FRINT:
450 case ISD::FTRUNC:
451 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
452 break;
453 }
454 setOperationAction(
455 Op, VT, IsOpSupported ? Action : NoBF16Action);
456 };
457
458 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
459 LegalizeAction NoI16x2Action) {
460 bool IsOpSupported = false;
461 // instructions are available on sm_90 only
462 switch (Op) {
463 case ISD::ADD:
464 case ISD::SMAX:
465 case ISD::SMIN:
466 case ISD::UMIN:
467 case ISD::UMAX:
468 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
469 break;
470 }
471 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
472 };
473
474 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
475 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
476 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
477 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
478 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
479 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
480 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
481 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
482 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
483 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
484 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
485 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
486
487 // Conversion to/from FP16/FP16x2 is always legal.
492
494 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
496
497 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
498 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
499
500 // Conversion to/from BF16/BF16x2 is always legal.
505
506 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
507 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
508 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
509 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
510
511 // Conversion to/from i16/i16x2 is always legal.
516
521 // Only logical ops can be done on v4i8 directly, others must be done
522 // elementwise.
539 MVT::v4i8, Expand);
540
541 // Operations not directly supported by NVPTX.
542 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
543 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
544 MVT::i32, MVT::i64}) {
547 }
548
549 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
550 // For others we will expand to a SHL/SRA pair.
557
564
567
568 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
569 // that don't have h/w rotation we lower them to multi-instruction assembly.
570 // See ROT*_sw in NVPTXIntrInfo.td
575
577 setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
579 setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
583
584 // Indirect branch is not supported.
585 // This also disables Jump Table creation.
588
591
592 // We want to legalize constant-related memmove and memcpy
593 // intrinsics.
595
596 // Turn FP extload into load/fpextend
597 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
598 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
599 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
600 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
601 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
602 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
603 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
604 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
605 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
606 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
607 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
608 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
609 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
610 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
611 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
612 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
613 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
614 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
615 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
616 // Turn FP truncstore into trunc + store.
617 // FIXME: vector types should also be expanded
618 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
619 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
620 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
621 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
622 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
623
624 // PTX does not support load / store predicate registers
627
628 for (MVT VT : MVT::integer_valuetypes()) {
631 setTruncStoreAction(VT, MVT::i1, Expand);
632 }
633
634 // expand extload of vector of integers.
636 MVT::v2i8, Expand);
637 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
638
639 // This is legal in NVPTX
644
647
648 // TRAP can be lowered to PTX trap
649 setOperationAction(ISD::TRAP, MVT::Other, Legal);
650
651 // Register custom handling for vector loads/stores
653 if (IsPTXVectorType(VT)) {
657 }
658 }
659
660 // Support varargs.
665
666 // Custom handling for i8 intrinsics
668
669 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
675
678 }
679
680 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
681 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
682 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
683 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
684 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
685 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
686 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
687
688 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
689 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
690 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
691 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
692 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
693 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
694
695 // Other arithmetic and logic ops are unsupported.
699 MVT::v2i16, Expand);
700
705 if (STI.getPTXVersion() >= 43) {
710 }
711
713 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
716
717 // PTX does not directly support SELP of i1, so promote to i32 first
719
720 // PTX cannot multiply two i64s in a single instruction.
723
724 // We have some custom DAG combine patterns for these nodes
727 ISD::VSELECT});
728
729 // setcc for f16x2 and bf16x2 needs special handling to prevent
730 // legalizer's attempt to scalarize it due to v2i1 not being legal.
731 if (STI.allowFP16Math() || STI.hasBF16Math())
733
734 // Promote fp16 arithmetic if fp16 hardware isn't available or the
735 // user passed --nvptx-no-fp16-math. The flag is useful because,
736 // although sm_53+ GPUs have some sort of FP16 support in
737 // hardware, only sm_53 and sm_60 have a full implementation. Others
738 // only have a token amount of hardware and are likely to run faster
739 // by using fp32 units instead.
740 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
741 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
742 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
743 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
744 // bf16 must be promoted to f32.
745 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
746 if (getOperationAction(Op, MVT::bf16) == Promote)
747 AddPromotedToType(Op, MVT::bf16, MVT::f32);
748 }
749
750 // f16/f16x2 neg was introduced in PTX 60, SM_53.
751 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
752 STI.getPTXVersion() >= 60 &&
753 STI.allowFP16Math();
754 for (const auto &VT : {MVT::f16, MVT::v2f16})
756 IsFP16FP16x2NegAvailable ? Legal : Expand);
757
758 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
759 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
760 // (would be) Library functions.
761
762 // These map to conversion instructions for scalar FP types.
763 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
765 setOperationAction(Op, MVT::f16, Legal);
766 setOperationAction(Op, MVT::f32, Legal);
767 setOperationAction(Op, MVT::f64, Legal);
768 setOperationAction(Op, MVT::v2f16, Expand);
769 setOperationAction(Op, MVT::v2bf16, Expand);
770 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
771 if (getOperationAction(Op, MVT::bf16) == Promote)
772 AddPromotedToType(Op, MVT::bf16, MVT::f32);
773 }
774
775 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
777 }
778 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
779 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
782 }
783 }
784
785 // sm_80 only has conversions between f32 and bf16. Custom lower all other
786 // bf16 conversions.
787 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
788 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
791 VT, Custom);
792 }
795 MVT::bf16, Custom);
796 }
797
804 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
805
806 // 'Expand' implements FCOPYSIGN without calling an external library.
813
814 // These map to corresponding instructions for f32/f64. f16 must be
815 // promoted to f32. v2f16 is expanded to f16, which is then promoted
816 // to f32.
817 for (const auto &Op :
819 setOperationAction(Op, MVT::f16, Promote);
820 setOperationAction(Op, MVT::f32, Legal);
821 setOperationAction(Op, MVT::f64, Legal);
822 setOperationAction(Op, MVT::v2f16, Expand);
823 setOperationAction(Op, MVT::v2bf16, Expand);
824 setOperationAction(Op, MVT::bf16, Promote);
825 AddPromotedToType(Op, MVT::bf16, MVT::f32);
826 }
827 for (const auto &Op : {ISD::FABS}) {
828 setOperationAction(Op, MVT::f16, Promote);
829 setOperationAction(Op, MVT::f32, Legal);
830 setOperationAction(Op, MVT::f64, Legal);
831 setOperationAction(Op, MVT::v2f16, Expand);
832 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
833 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
834 if (getOperationAction(Op, MVT::bf16) == Promote)
835 AddPromotedToType(Op, MVT::bf16, MVT::f32);
836 }
837
838 // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
839 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
840 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
841 return IsAtLeastSm80 ? Legal : NotSm80Action;
842 };
843 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
844 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
845 setOperationAction(Op, MVT::f32, Legal);
846 setOperationAction(Op, MVT::f64, Legal);
847 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
848 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
849 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
850 if (getOperationAction(Op, MVT::bf16) == Promote)
851 AddPromotedToType(Op, MVT::bf16, MVT::f32);
852 }
853 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
854 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
855 setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
856 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
857 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
858 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
859 }
860
861 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
862 // No FPOW or FREM in PTX.
863
864 // Now deduce the information based on the above-mentioned
865 // actions
867
870}
871
872const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
873
874#define MAKE_CASE(V) \
875 case V: \
876 return #V;
877
878 switch ((NVPTXISD::NodeType)Opcode) {
879 case NVPTXISD::FIRST_NUMBER:
880 break;
881
 // ... MAKE_CASE entries for the remaining NVPTXISD opcodes elided ...
1296 }
1297 return nullptr;
1298
1299#undef MAKE_CASE
1300}
1301
1302TargetLoweringBase::LegalizeTypeAction
1303NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1304 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1305 VT.getScalarType() == MVT::i1)
1306 return TypeSplitVector;
1307 if (Isv2x16VT(VT))
1308 return TypeLegal;
1309 return TargetLoweringBase::getPreferredVectorAction(VT);
1310}
1311
1312SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1313 int Enabled, int &ExtraSteps,
1314 bool &UseOneConst,
1315 bool Reciprocal) const {
1316 if (!(Enabled == ReciprocalEstimate::Enabled ||
1317 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1318 return SDValue();
1319
1320 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1321 ExtraSteps = 0;
1322
1323 SDLoc DL(Operand);
1324 EVT VT = Operand.getValueType();
1325 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1326
1327 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1328 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1329 DAG.getConstant(IID, DL, MVT::i32), Operand);
1330 };
1331
1332 // The sqrt and rsqrt refinement processes assume we always start out with an
1333 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1334 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1335 // any refinement, we must return a regular sqrt.
1336 if (Reciprocal || ExtraSteps > 0) {
1337 if (VT == MVT::f32)
1338 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1339 : Intrinsic::nvvm_rsqrt_approx_f);
1340 else if (VT == MVT::f64)
1341 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1342 else
1343 return SDValue();
1344 } else {
1345 if (VT == MVT::f32)
1346 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1347 : Intrinsic::nvvm_sqrt_approx_f);
1348 else {
1349 // There's no sqrt.approx.f64 instruction, so we emit
1350 // reciprocal(rsqrt(x)). This is faster than
1351 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1352 // x * rsqrt(x).)
1353 return DAG.getNode(
1355 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1356 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1357 }
1358 }
1359}
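// Added note (illustrative commentary, not part of the original source): for
// f32 with refinement or a reciprocal requested, the code above emits the
// nvvm_rsqrt_approx_f / nvvm_rsqrt_approx_ftz_f intrinsic; for a plain f64
// sqrt it emits rcp.approx.ftz.d(rsqrt.approx.d(x)), since there is no
// sqrt.approx.f64 instruction.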
1360
1361SDValue
1363 SDLoc dl(Op);
1364 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1365 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1366 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1367 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1368}
1369
1370static bool IsTypePassedAsArray(const Type *Ty) {
1371 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1372 Ty->isHalfTy() || Ty->isBFloatTy();
1373}
1374
1376 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1377 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1378 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1379 const CallBase &CB, unsigned UniqueCallSite) const {
1380 auto PtrVT = getPointerTy(DL);
1381
1382 bool isABI = (STI.getSmVersion() >= 20);
1383 assert(isABI && "Non-ABI compilation is not supported");
1384 if (!isABI)
1385 return "";
1386
1387 std::string Prototype;
1388 raw_string_ostream O(Prototype);
1389 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1390
1391 if (retTy->getTypeID() == Type::VoidTyID) {
1392 O << "()";
1393 } else {
1394 O << "(";
1395 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1396 !IsTypePassedAsArray(retTy)) {
1397 unsigned size = 0;
1398 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1399 size = ITy->getBitWidth();
1400 } else {
1401 assert(retTy->isFloatingPointTy() &&
1402 "Floating point type expected here");
1403 size = retTy->getPrimitiveSizeInBits();
1404 }
1405 // PTX ABI requires all scalar return values to be at least 32
1406 // bits in size. fp16 normally uses .b16 as its storage type in
1407 // PTX, so its size must be adjusted here, too.
1408 size = promoteScalarArgumentSize(size);
1409
1410 O << ".param .b" << size << " _";
1411 } else if (isa<PointerType>(retTy)) {
1412 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1413 } else if (IsTypePassedAsArray(retTy)) {
1414 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1415 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1416 } else {
1417 llvm_unreachable("Unknown return type");
1418 }
1419 O << ") ";
1420 }
1421 O << "_ (";
1422
1423 bool first = true;
1424
1425 const Function *F = CB.getFunction();
1426 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1427 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1428 Type *Ty = Args[i].Ty;
1429 if (!first) {
1430 O << ", ";
1431 }
1432 first = false;
1433
1434 if (!Outs[OIdx].Flags.isByVal()) {
1435 if (IsTypePassedAsArray(Ty)) {
1436 unsigned ParamAlign = 0;
1437 const CallInst *CallI = cast<CallInst>(&CB);
1438 // +1 because index 0 is reserved for return type alignment
1439 if (!getAlign(*CallI, i + 1, ParamAlign))
1440 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1441 O << ".param .align " << ParamAlign << " .b8 ";
1442 O << "_";
1443 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1444 // update the index for Outs
1445 SmallVector<EVT, 16> vtparts;
1446 ComputeValueVTs(*this, DL, Ty, vtparts);
1447 if (unsigned len = vtparts.size())
1448 OIdx += len - 1;
1449 continue;
1450 }
1451 // i8 types in IR will be i16 types in SDAG
1452 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1453 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1454 "type mismatch between callee prototype and arguments");
1455 // scalar type
1456 unsigned sz = 0;
1457 if (isa<IntegerType>(Ty)) {
1458 sz = cast<IntegerType>(Ty)->getBitWidth();
1459 sz = promoteScalarArgumentSize(sz);
1460 } else if (isa<PointerType>(Ty)) {
1461 sz = PtrVT.getSizeInBits();
1462 } else {
1463 sz = Ty->getPrimitiveSizeInBits();
1464 }
1465 O << ".param .b" << sz << " ";
1466 O << "_";
1467 continue;
1468 }
1469
1470 Type *ETy = Args[i].IndirectType;
1471 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1472 Align ParamByValAlign =
1473 getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1474
1475 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1476 O << "_";
1477 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1478 }
1479
1480 if (VAInfo)
1481 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1482 << " .b8 _[]\n";
1483 O << ")";
1485 O << " .noreturn";
1486 O << ";";
1487
1488 return Prototype;
1489}
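// Added example (illustrative commentary, not part of the original source):
// for an indirect call to a function with C-level type "float(float, int)",
// the string built above looks roughly like:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);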
1490
1491Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1492 unsigned Idx,
1493 const DataLayout &DL) const {
1494 if (!CB) {
1495 // CallSite is null; fall back to the ABI type alignment
1496 return DL.getABITypeAlign(Ty);
1497 }
1498
1499 unsigned Alignment = 0;
1500 const Function *DirectCallee = CB->getCalledFunction();
1501
1502 if (!DirectCallee) {
1503 // We don't have a direct function symbol, but that may be because of
1504 // constant cast instructions in the call.
1505
1506 // With bitcast'd call targets, the instruction will be the call
1507 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1508 // Check if we have call alignment metadata
1509 if (getAlign(*CI, Idx, Alignment))
1510 return Align(Alignment);
1511 }
1512 DirectCallee = getMaybeBitcastedCallee(CB);
1513 }
1514
1515 // Check for function alignment information if we found that the
1516 // ultimate target is a Function
1517 if (DirectCallee) {
1518 if (getAlign(*DirectCallee, Idx, Alignment))
1519 return Align(Alignment);
1520 // If alignment information is not available, fall back to the
1521 // default function param optimized type alignment
1522 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1523 }
1524
1525 // Call is indirect, fall back to the ABI type alignment
1526 return DL.getABITypeAlign(Ty);
1527}
1528
1529static bool adjustElementType(EVT &ElementType) {
1530 switch (ElementType.getSimpleVT().SimpleTy) {
1531 default:
1532 return false;
1533 case MVT::f16:
1534 case MVT::bf16:
1535 ElementType = MVT::i16;
1536 return true;
1537 case MVT::f32:
1538 case MVT::v2f16:
1539 case MVT::v2bf16:
1540 ElementType = MVT::i32;
1541 return true;
1542 case MVT::f64:
1543 ElementType = MVT::i64;
1544 return true;
1545 }
1546}
1547
1548// Use byte-store when the param address of the argument value is unaligned.
1550// This may happen when the argument value is a field of a packed structure.
1550//
1551// This is called in LowerCall() when passing the param values.
1553 uint64_t Offset, EVT ElementType,
1554 SDValue StVal, SDValue &InGlue,
1555 unsigned ArgID, const SDLoc &dl) {
1556 // Bit logic only works on integer types
1557 if (adjustElementType(ElementType))
1558 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1559
1560 // Store each byte
1561 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1562 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1563 // Shift the byte to the last byte position
1564 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1565 DAG.getConstant(i * 8, dl, MVT::i32));
1566 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1567 DAG.getConstant(Offset + i, dl, MVT::i32),
1568 ShiftVal, InGlue};
1569 // Trunc store only the last byte by using
1570 // st.param.b8
1571 // The register type can be larger than b8.
1572 Chain = DAG.getMemIntrinsicNode(
1573 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1575 InGlue = Chain.getValue(1);
1576 }
1577 return Chain;
1578}
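// Added note (illustrative commentary, not part of the original source): e.g.
// an f32 value stored to a param offset that is not 4-byte aligned is bitcast
// to i32 and emitted as four st.param.b8 stores of the shifted bytes.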
1579
1580// Use byte-load when the param address of the returned value is unaligned.
1581// This may happen when the returned value is a field of a packed structure.
1582static SDValue
1584 EVT ElementType, SDValue &InGlue,
1585 SmallVectorImpl<SDValue> &TempProxyRegOps,
1586 const SDLoc &dl) {
1587 // Bit logic only works on integer types
1588 EVT MergedType = ElementType;
1589 adjustElementType(MergedType);
1590
1591 // Load each byte and construct the whole value. Initial value to 0
1592 SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1593 // LoadParamMemI8 loads into i16 register only
1594 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1595 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1596 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1597 DAG.getConstant(Offset + i, dl, MVT::i32),
1598 InGlue};
1599 // This will be selected to LoadParamMemI8
1600 SDValue LdVal =
1601 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1602 MVT::i8, MachinePointerInfo(), Align(1));
1603 SDValue TmpLdVal = LdVal.getValue(0);
1604 Chain = LdVal.getValue(1);
1605 InGlue = LdVal.getValue(2);
1606
1607 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1608 TmpLdVal.getSimpleValueType(), TmpLdVal);
1609 TempProxyRegOps.push_back(TmpLdVal);
1610
1611 SDValue CMask = DAG.getConstant(255, dl, MergedType);
1612 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1613 // Need to extend the i16 register to the whole width.
1614 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1615 // Mask off the high bits. Leave only the lower 8 bits.
1616 // Do this because we are using loadparam.b8.
1617 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1618 // Shift and merge
1619 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1620 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1621 }
1622 if (ElementType != MergedType)
1623 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1624
1625 return RetVal;
1626}
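// Added note (illustrative commentary, not part of the original source): this
// is the inverse of the byte-store path above; e.g. an unaligned f32 return
// piece is reassembled from four ld.param.b8 loads via zext/and/shl/or and
// then bitcast back to f32.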
1627
1628SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1629 SmallVectorImpl<SDValue> &InVals) const {
1630
1631 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1633 "Support for variadic functions (unsized array parameter) introduced "
1634 "in PTX ISA version 6.0 and requires target sm_30.");
1635
1636 SelectionDAG &DAG = CLI.DAG;
1637 SDLoc dl = CLI.DL;
1638 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1639 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1640 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1641 SDValue Chain = CLI.Chain;
1642 SDValue Callee = CLI.Callee;
1643 bool &isTailCall = CLI.IsTailCall;
1644 ArgListTy &Args = CLI.getArgs();
1645 Type *RetTy = CLI.RetTy;
1646 const CallBase *CB = CLI.CB;
1647 const DataLayout &DL = DAG.getDataLayout();
1648
1649 bool isABI = (STI.getSmVersion() >= 20);
1650 assert(isABI && "Non-ABI compilation is not supported");
1651 if (!isABI)
1652 return Chain;
1653
1654 // Variadic arguments.
1655 //
1656 // Normally, for each argument, we declare a param scalar or a param
1657 // byte array in the .param space, and store the argument value to that
1658 // param scalar or array starting at offset 0.
1659 //
1660 // In the case of the first variadic argument, we declare a vararg byte array
1661 // with size 0. The exact size of this array isn't known at this point, so
1662 // it'll be patched later. All the variadic arguments will be stored to this
1663 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1664 // initially set to 0, so it can be used for non-variadic arguments (which use
1665 // 0 offset) to simplify the code.
1666 //
1667 // After all variadic arguments are processed, 'VAOffset' holds the size of the
1668 // vararg byte array.
1669
1670 SDValue VADeclareParam; // vararg byte array
1671 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1672 unsigned VAOffset = 0; // current offset in the param array
1673
1674 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1675 SDValue TempChain = Chain;
1676 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1677 SDValue InGlue = Chain.getValue(1);
1678
1679 unsigned ParamCount = 0;
1680 // Args.size() and Outs.size() need not match.
1681 // Outs.size() will be larger
1682 // * if there is an aggregate argument with multiple fields (each field
1683 // showing up separately in Outs)
1684 // * if there is a vector argument with more than typical vector-length
1685 // elements (generally if more than 4) where each vector element is
1686 // individually present in Outs.
1687 // So a different index should be used for indexing into Outs/OutVals.
1688 // See similar issue in LowerFormalArguments.
1689 unsigned OIdx = 0;
1690 // Declare the .params or .reg needed to pass values
1691 // to the function
1692 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1693 EVT VT = Outs[OIdx].VT;
1694 Type *Ty = Args[i].Ty;
1695 bool IsVAArg = (i >= CLI.NumFixedArgs);
1696 bool IsByVal = Outs[OIdx].Flags.isByVal();
1697
1698 SmallVector<EVT, 16> VTs;
1699 SmallVector<uint64_t, 16> Offsets;
1700
1701 assert((!IsByVal || Args[i].IndirectType) &&
1702 "byval arg must have indirect type");
1703 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1704 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1705
1706 Align ArgAlign;
1707 if (IsByVal) {
1708 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1709 // so we don't need to worry whether it's naturally aligned or not.
1710 // See TargetLowering::LowerCallTo().
1711 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1712 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1713 InitialAlign, DL);
1714 if (IsVAArg)
1715 VAOffset = alignTo(VAOffset, ArgAlign);
1716 } else {
1717 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1718 }
1719
1720 unsigned TypeSize =
1721 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1722 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1723
1724 bool NeedAlign; // Does argument declaration specify alignment?
1725 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1726 if (IsVAArg) {
1727 if (ParamCount == FirstVAArg) {
1728 SDValue DeclareParamOps[] = {
1729 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1730 DAG.getConstant(ParamCount, dl, MVT::i32),
1731 DAG.getConstant(1, dl, MVT::i32), InGlue};
1732 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1733 DeclareParamVTs, DeclareParamOps);
1734 }
1735 NeedAlign = PassAsArray;
1736 } else if (PassAsArray) {
1737 // declare .param .align <align> .b8 .param<n>[<size>];
1738 SDValue DeclareParamOps[] = {
1739 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1740 DAG.getConstant(ParamCount, dl, MVT::i32),
1741 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1742 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1743 DeclareParamOps);
1744 NeedAlign = true;
1745 } else {
1746 // declare .param .b<size> .param<n>;
1747 if (VT.isInteger() || VT.isFloatingPoint()) {
1748 // PTX ABI requires integral types to be at least 32 bits in
1749 // size. FP16 is loaded/stored using i16, so it's handled
1750 // here as well.
1751 TypeSize = promoteScalarArgumentSize(TypeSize);
1752 }
1753 SDValue DeclareScalarParamOps[] = {
1754 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1755 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1756 DAG.getConstant(0, dl, MVT::i32), InGlue};
1757 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1758 DeclareScalarParamOps);
1759 NeedAlign = false;
1760 }
1761 InGlue = Chain.getValue(1);
1762
1763 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1764 // than 32-bits are sign extended or zero extended, depending on
1765 // whether they are signed or unsigned types. This case applies
1766 // only to scalar parameters and not to aggregate values.
1767 bool ExtendIntegerParam =
1768 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
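 // Added example (illustrative commentary, not part of the original source):
 // an i8 or i16 scalar argument is declared as .param .b32 and its value is
 // sign- or zero-extended to 32 bits before the st.param, matching the
 // interoperability rule quoted above.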
1769
1770 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1771 SmallVector<SDValue, 6> StoreOperands;
1772 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1773 EVT EltVT = VTs[j];
1774 int CurOffset = Offsets[j];
1775 MaybeAlign PartAlign;
1776 if (NeedAlign)
1777 PartAlign = commonAlignment(ArgAlign, CurOffset);
1778
1779 SDValue StVal = OutVals[OIdx];
1780
1781 MVT PromotedVT;
1782 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1783 EltVT = EVT(PromotedVT);
1784 }
1785 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1786 llvm::ISD::NodeType Ext =
1787 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1788 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1789 }
1790
1791 if (IsByVal) {
1792 auto PtrVT = getPointerTy(DL);
1793 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1794 DAG.getConstant(CurOffset, dl, PtrVT));
1795 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1796 PartAlign);
1797 } else if (ExtendIntegerParam) {
1798 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1799 // zext/sext to i32
1800 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1801 : ISD::ZERO_EXTEND,
1802 dl, MVT::i32, StVal);
1803 }
1804
1805 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1806 // Use 16-bit registers for small stores as it's the
1807 // smallest general purpose register size supported by NVPTX.
1808 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1809 }
1810
1811 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1812 // scalar store. In such cases, fall back to byte stores.
1813 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1814 PartAlign.value() <
1815 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1816 assert(StoreOperands.empty() && "Unfinished preceding store.");
1817 Chain = LowerUnalignedStoreParam(
1818 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1819 StVal, InGlue, ParamCount, dl);
1820
1821 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1822 // into the SDAG, so just move on to the next element.
1823 if (!IsByVal)
1824 ++OIdx;
1825 continue;
1826 }
1827
1828 // New store.
1829 if (VectorInfo[j] & PVF_FIRST) {
1830 assert(StoreOperands.empty() && "Unfinished preceding store.");
1831 StoreOperands.push_back(Chain);
1832 StoreOperands.push_back(
1833 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1834
1835 StoreOperands.push_back(DAG.getConstant(
1836 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1837 dl, MVT::i32));
1838 }
1839
1840 // Record the value to store.
1841 StoreOperands.push_back(StVal);
1842
1843 if (VectorInfo[j] & PVF_LAST) {
1844 unsigned NumElts = StoreOperands.size() - 3;
1845 NVPTXISD::NodeType Op;
1846 switch (NumElts) {
1847 case 1:
1848 Op = NVPTXISD::StoreParam;
1849 break;
1850 case 2:
1851 Op = NVPTXISD::StoreParamV2;
1852 break;
1853 case 4:
1854 Op = NVPTXISD::StoreParamV4;
1855 break;
1856 default:
1857 llvm_unreachable("Invalid vector info.");
1858 }
1859
1860 StoreOperands.push_back(InGlue);
1861
1862 // Adjust type of the store op if we've extended the scalar
1863 // return value.
1864 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1865
1866 Chain = DAG.getMemIntrinsicNode(
1867 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1868 TheStoreType, MachinePointerInfo(), PartAlign,
1870 InGlue = Chain.getValue(1);
1871
1872 // Cleanup.
1873 StoreOperands.clear();
1874
1875 // TODO: We may need to support vector types that can be passed
1876 // as scalars in variadic arguments.
1877 if (!IsByVal && IsVAArg) {
1878 assert(NumElts == 1 &&
1879 "Vectorization is expected to be disabled for variadics.");
1880 VAOffset += DL.getTypeAllocSize(
1881 TheStoreType.getTypeForEVT(*DAG.getContext()));
1882 }
1883 }
1884 if (!IsByVal)
1885 ++OIdx;
1886 }
1887 assert(StoreOperands.empty() && "Unfinished parameter store.");
1888 if (!IsByVal && VTs.size() > 0)
1889 --OIdx;
1890 ++ParamCount;
1891 if (IsByVal && IsVAArg)
1892 VAOffset += TypeSize;
1893 }
1894
1895 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1896 MaybeAlign retAlignment = std::nullopt;
1897
1898 // Handle Result
1899 if (Ins.size() > 0) {
1900 SmallVector<EVT, 16> resvtparts;
1901 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1902
1903 // Declare
1904 // .param .align N .b8 retval0[<size-in-bytes>], or
1905 // .param .b<size-in-bits> retval0
1906 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1907 if (!IsTypePassedAsArray(RetTy)) {
1908 resultsz = promoteScalarArgumentSize(resultsz);
1909 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1910 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1911 DAG.getConstant(resultsz, dl, MVT::i32),
1912 DAG.getConstant(0, dl, MVT::i32), InGlue };
1913 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1914 DeclareRetOps);
1915 InGlue = Chain.getValue(1);
1916 } else {
1917 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1918 assert(retAlignment && "retAlignment is guaranteed to be set");
1919 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1920 SDValue DeclareRetOps[] = {
1921 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1922 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1923 DAG.getConstant(0, dl, MVT::i32), InGlue};
1924 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1925 DeclareRetOps);
1926 InGlue = Chain.getValue(1);
1927 }
1928 }
1929
1930 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1931 // Set the size of the vararg param byte array if the callee is a variadic
1932 // function and the variadic part is not empty.
1933 if (HasVAArgs) {
1934 SDValue DeclareParamOps[] = {
1935 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1936 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1937 VADeclareParam.getOperand(4)};
1938 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1939 VADeclareParam->getVTList(), DeclareParamOps);
1940 }
1941
1942 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1943 // between them we must rely on the call site value which is valid for
1944 // indirect calls but is always null for libcalls.
1945 bool isIndirectCall = !Func && CB;
1946
1947 if (isa<ExternalSymbolSDNode>(Callee)) {
1948 Function* CalleeFunc = nullptr;
1949
1950 // Try to find the callee in the current module.
1951 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1952 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1953
1954 // Set the "libcall callee" attribute to indicate that the function
1955 // must always have a declaration.
1956 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1957 }
1958
1959 if (isIndirectCall) {
1960 // This is indirect function call case : PTX requires a prototype of the
1961 // form
1962 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1963 // to be emitted, and the label has to be used as the last arg of the call
1964 // instruction.
1965 // The prototype is embedded in a string and put as the operand for a
1966 // CallPrototype SDNode which will print out to the value of the string.
1967 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1968 std::string Proto = getPrototype(
1969 DL, RetTy, Args, Outs, retAlignment,
1970 HasVAArgs
1971 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1972 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1973 : std::nullopt,
1974 *CB, UniqueCallSite);
1975 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1976 SDValue ProtoOps[] = {
1977 Chain,
1978 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1979 InGlue,
1980 };
1981 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1982 InGlue = Chain.getValue(1);
1983 }
1984 // Op to just print "call"
1985 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1986 SDValue PrintCallOps[] = {
1987 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1988 };
1989 // We model convergent calls as separate opcodes.
1991 if (CLI.IsConvergent)
1994 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1995 InGlue = Chain.getValue(1);
1996
1997 // Ops to print out the function name
1998 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1999 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2000 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2001 InGlue = Chain.getValue(1);
2002
2003 // Ops to print out the param list
2004 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2005 SDValue CallArgBeginOps[] = { Chain, InGlue };
2006 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2007 CallArgBeginOps);
2008 InGlue = Chain.getValue(1);
2009
2010 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2011 ++i) {
2012 unsigned opcode;
2013 if (i == (e - 1))
2014 opcode = NVPTXISD::LastCallArg;
2015 else
2016 opcode = NVPTXISD::CallArg;
2017 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2018 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2019 DAG.getConstant(i, dl, MVT::i32), InGlue };
2020 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2021 InGlue = Chain.getValue(1);
2022 }
2023 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2024 SDValue CallArgEndOps[] = { Chain,
2025 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2026 InGlue };
2027 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2028 InGlue = Chain.getValue(1);
2029
2030 if (isIndirectCall) {
2031 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2032 SDValue PrototypeOps[] = {
2033 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2034 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2035 InGlue = Chain.getValue(1);
2036 }
2037
2038 SmallVector<SDValue, 16> ProxyRegOps;
2039 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2040 // An item of the vector is filled if the element does not need a ProxyReg
2041 // operation on it and should be added to InVals as is. ProxyRegOps and
2042 // ProxyRegTruncates contain empty/none items at the same index.
2043 SmallVector<SDValue, 16> RetElts;
2044 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
2045 // to use the values of `LoadParam`s and to be replaced later when
2046 // `CALLSEQ_END` is added.
2047 SmallVector<SDValue, 16> TempProxyRegOps;
2048
2049 // Generate loads from param memory/moves from registers for result
2050 if (Ins.size() > 0) {
2051 SmallVector<EVT, 16> VTs;
2052 SmallVector<uint64_t, 16> Offsets;
2053 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2054 assert(VTs.size() == Ins.size() && "Bad value decomposition");
2055
2056 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2057 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2058
2059 SmallVector<EVT, 6> LoadVTs;
2060 int VecIdx = -1; // Index of the first element of the vector.
2061
2062 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2063 // 32-bits are sign extended or zero extended, depending on whether
2064 // they are signed or unsigned types.
2065 bool ExtendIntegerRetVal =
2066 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2067
2068 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2069 bool needTruncate = false;
2070 EVT TheLoadType = VTs[i];
2071 EVT EltType = Ins[i].VT;
2072 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2073 MVT PromotedVT;
2074
2075 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2076 TheLoadType = EVT(PromotedVT);
2077 EltType = EVT(PromotedVT);
2078 needTruncate = true;
2079 }
2080
2081 if (ExtendIntegerRetVal) {
2082 TheLoadType = MVT::i32;
2083 EltType = MVT::i32;
2084 needTruncate = true;
2085 } else if (TheLoadType.getSizeInBits() < 16) {
2086 if (VTs[i].isInteger())
2087 needTruncate = true;
2088 EltType = MVT::i16;
2089 }
2090
2091 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2092 // scalar load. In such cases, fall back to byte loads.
2093 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
2094 EltAlign < DL.getABITypeAlign(
2095 TheLoadType.getTypeForEVT(*DAG.getContext()))) {
2096 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2097 SDValue Ret = LowerUnalignedLoadRetParam(
2098 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
2099 ProxyRegOps.push_back(SDValue());
2100 ProxyRegTruncates.push_back(std::optional<MVT>());
2101 RetElts.resize(i);
2102 RetElts.push_back(Ret);
2103
2104 continue;
2105 }
2106
2107 // Record index of the very first element of the vector.
2108 if (VectorInfo[i] & PVF_FIRST) {
2109 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2110 VecIdx = i;
2111 }
2112
2113 LoadVTs.push_back(EltType);
2114
2115 if (VectorInfo[i] & PVF_LAST) {
2116 unsigned NumElts = LoadVTs.size();
2117 LoadVTs.push_back(MVT::Other);
2118 LoadVTs.push_back(MVT::Glue);
2119 NVPTXISD::NodeType Op;
2120 switch (NumElts) {
2121 case 1:
2122 Op = NVPTXISD::LoadParam;
2123 break;
2124 case 2:
2125 Op = NVPTXISD::LoadParamV2;
2126 break;
2127 case 4:
2128 Op = NVPTXISD::LoadParamV4;
2129 break;
2130 default:
2131 llvm_unreachable("Invalid vector info.");
2132 }
2133
2134 SDValue LoadOperands[] = {
2135 Chain, DAG.getConstant(1, dl, MVT::i32),
2136 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2137 SDValue RetVal = DAG.getMemIntrinsicNode(
2138 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2139 MachinePointerInfo(), EltAlign,
2141
2142 for (unsigned j = 0; j < NumElts; ++j) {
2143 ProxyRegOps.push_back(RetVal.getValue(j));
2144
2145 if (needTruncate)
2146 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2147 else
2148 ProxyRegTruncates.push_back(std::optional<MVT>());
2149 }
2150
2151 Chain = RetVal.getValue(NumElts);
2152 InGlue = RetVal.getValue(NumElts + 1);
2153
2154 // Cleanup
2155 VecIdx = -1;
2156 LoadVTs.clear();
2157 }
2158 }
2159 }
2160
2161 Chain =
2162 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2163 InGlue = Chain.getValue(1);
2164
2165 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2166 // will not get lost. Otherwise, during libcall expansion, the nodes can become
2167 // dangling.
2168 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2169 if (i < RetElts.size() && RetElts[i]) {
2170 InVals.push_back(RetElts[i]);
2171 continue;
2172 }
2173
2174 SDValue Ret = DAG.getNode(
2176 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2177 { Chain, ProxyRegOps[i], InGlue }
2178 );
2179
2180 Chain = Ret.getValue(1);
2181 InGlue = Ret.getValue(2);
2182
2183 if (ProxyRegTruncates[i]) {
2184 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2185 }
2186
2187 InVals.push_back(Ret);
2188 }
2189
2190 for (SDValue &T : TempProxyRegOps) {
2191 SDValue Repl = DAG.getNode(
2193 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
2194 {Chain, T.getOperand(0), InGlue});
2195 DAG.ReplaceAllUsesWith(T, Repl);
2196 DAG.RemoveDeadNode(T.getNode());
2197
2198 Chain = Repl.getValue(1);
2199 InGlue = Repl.getValue(2);
2200 }
2201
2202 // set isTailCall to false for now, until we figure out how to express
2203 // tail call optimization in PTX
2204 isTailCall = false;
2205 return Chain;
2206}
2207
2208SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2209 SelectionDAG &DAG) const {
2210
2211 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2212 const Function &Fn = DAG.getMachineFunction().getFunction();
2213
2214 DiagnosticInfoUnsupported NoDynamicAlloca(
2215 Fn,
2216 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2217 "requires target sm_52.",
2218 SDLoc(Op).getDebugLoc());
2219 DAG.getContext()->diagnose(NoDynamicAlloca);
2220 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2221 Op.getOperand(0)};
2222 return DAG.getMergeValues(Ops, SDLoc());
2223 }
2224
2225 SDValue Chain = Op.getOperand(0);
2226 SDValue Size = Op.getOperand(1);
2227 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2228 SDLoc DL(Op.getNode());
2229
2230 // The size operand of the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
2231 if (nvTM->is64Bit())
2232 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2233 else
2234 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2235
2236 SDValue AllocOps[] = {Chain, Size,
2237 DAG.getTargetConstant(Align, DL, MVT::i32)};
2238 SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
2239 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2240
2241 SDValue MergeOps[] = {Alloca, Chain};
2242 return DAG.getMergeValues(MergeOps, DL);
2243}
2244
2245// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2246// (see LegalizeDAG.cpp). This is slow and uses local memory.
2247// We lower it with extract/insert/build_vector, just as LegalizeOp() did in LLVM 2.5.
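// For example (sketch): concat_vectors(<2 x float> A, <2 x float> B) becomes
//   build_vector(extractelt(A,0), extractelt(A,1), extractelt(B,0), extractelt(B,1))
// so the result stays in registers instead of going through a stack temporary.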
2248SDValue
2249NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2250 SDNode *Node = Op.getNode();
2251 SDLoc dl(Node);
2252 SmallVector<SDValue, 8> Ops;
2253 unsigned NumOperands = Node->getNumOperands();
2254 for (unsigned i = 0; i < NumOperands; ++i) {
2255 SDValue SubOp = Node->getOperand(i);
2256 EVT VVT = SubOp.getNode()->getValueType(0);
2257 EVT EltVT = VVT.getVectorElementType();
2258 unsigned NumSubElem = VVT.getVectorNumElements();
2259 for (unsigned j = 0; j < NumSubElem; ++j) {
2260 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2261 DAG.getIntPtrConstant(j, dl)));
2262 }
2263 }
2264 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2265}
2266
2267// We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2268// would get lowered as two constant loads and a vector-packing move.
2269// Instead we want just a constant move:
2270// mov.b32 %r2, 0x40003C00
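// For example, BUILD_VECTOR <half 1.0, half 2.0> packs element 0 (1.0 = 0x3C00)
// into bits [15:0] and element 1 (2.0 = 0x4000) into bits [31:16], producing
// the single immediate 0x40003C00 shown above.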
2271SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2272 SelectionDAG &DAG) const {
2273 EVT VT = Op->getValueType(0);
2274 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2275 return Op;
2276
2277 SDLoc DL(Op);
2278
2279 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2280 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2281 isa<ConstantFPSDNode>(Operand);
2282 })) {
2283 // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2284 // lets us optimize the calculation of any constant parts.
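// For example (illustrative), building <i8 0x11, i8 0x22, i8 0x33, i8 0x44>:
//   E01   = BFI(0x22, 0x11, pos=8,  len=8)  -> 0x00002211
//   E012  = BFI(0x33, E01,  pos=16, len=8)  -> 0x00332211
//   E0123 = BFI(0x44, E012, pos=24, len=8)  -> 0x44332211
// Constant lanes fold away, so only the non-constant bytes cost a BFI.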
2285 if (VT == MVT::v4i8) {
2286 SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2287 SDValue E01 = DAG.getNode(
2288 NVPTXISD::BFI, DL, MVT::i32,
2289 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2290 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2291 SDValue E012 =
2292 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2293 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2294 E01, DAG.getConstant(16, DL, MVT::i32), C8);
2295 SDValue E0123 =
2296 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2297 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2298 E012, DAG.getConstant(24, DL, MVT::i32), C8);
2299 return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2300 }
2301 return Op;
2302 }
2303
2304 // Get the value of the Nth operand as an APInt(32). Undef values are treated as 0.
2305 auto GetOperand = [](SDValue Op, int N) -> APInt {
2306 const SDValue &Operand = Op->getOperand(N);
2307 EVT VT = Op->getValueType(0);
2308 if (Operand->isUndef())
2309 return APInt(32, 0);
2310 APInt Value;
2311 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2312 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2313 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2314 Value = Operand->getAsAPIntVal();
2315 else
2316 llvm_unreachable("Unsupported type");
2317 // i8 values are carried around as i16, so we need to zero out the upper bits
2318 // so they do not get in the way when combining individual byte values.
2319 if (VT == MVT::v4i8)
2320 Value = Value.trunc(8);
2321 return Value.zext(32);
2322 };
2323 APInt Value;
2324 if (Isv2x16VT(VT)) {
2325 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2326 } else if (VT == MVT::v4i8) {
2327 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2328 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2329 } else {
2330 llvm_unreachable("Unsupported type");
2331 }
2332 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2333 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2334}
2335
2336SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2337 SelectionDAG &DAG) const {
2338 SDValue Index = Op->getOperand(1);
2339 SDValue Vector = Op->getOperand(0);
2340 SDLoc DL(Op);
2341 EVT VectorVT = Vector.getValueType();
2342
2343 if (VectorVT == MVT::v4i8) {
2344 SDValue BFE =
2345 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2346 {Vector,
2347 DAG.getNode(ISD::MUL, DL, MVT::i32,
2348 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2349 DAG.getConstant(8, DL, MVT::i32)),
2350 DAG.getConstant(8, DL, MVT::i32)});
2351 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2352 }
2353
2354 // Constant index will be matched by tablegen.
2355 if (isa<ConstantSDNode>(Index.getNode()))
2356 return Op;
2357
2358 // Extract individual elements and select one of them.
2359 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2360 EVT EltVT = VectorVT.getVectorElementType();
2361
2362 SDLoc dl(Op.getNode());
2363 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2364 DAG.getIntPtrConstant(0, dl));
2365 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2366 DAG.getIntPtrConstant(1, dl));
2367 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2368 ISD::SETEQ);
2369}
2370
2371SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2372 SelectionDAG &DAG) const {
2373 SDValue Vector = Op->getOperand(0);
2374 EVT VectorVT = Vector.getValueType();
2375
2376 if (VectorVT != MVT::v4i8)
2377 return Op;
2378 SDLoc DL(Op);
2379 SDValue Value = Op->getOperand(1);
2380 if (Value->isUndef())
2381 return Vector;
2382
2383 SDValue Index = Op->getOperand(2);
2384
2385 SDValue BFI =
2386 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2387 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2388 DAG.getNode(ISD::MUL, DL, MVT::i32,
2389 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2390 DAG.getConstant(8, DL, MVT::i32)),
2391 DAG.getConstant(8, DL, MVT::i32)});
2392 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2393}
2394
2395SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2396 SelectionDAG &DAG) const {
2397 SDValue V1 = Op.getOperand(0);
2398 EVT VectorVT = V1.getValueType();
2399 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2400 return Op;
2401
2402 // Lower shuffle to PRMT instruction.
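// Each mask element becomes one selector nibble: byte indices 0-3 select from
// V1 and 4-7 select from V2. For example (illustrative), the mask <4, 5, 6, 7>
// yields the selector 0x7654, i.e. the result is the four bytes of V2.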
2403 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2404 SDValue V2 = Op.getOperand(1);
2405 uint32_t Selector = 0;
2406 for (auto I : llvm::enumerate(SVN->getMask())) {
2407 if (I.value() != -1) // -1 is a placeholder for undef.
2408 Selector |= (I.value() << (I.index() * 4));
2409 }
2410
2411 SDLoc DL(Op);
2412 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2413 DAG.getConstant(Selector, DL, MVT::i32),
2414 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2415}
2416/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
2417/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2418/// amount, or
2419/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2420/// amount.
2421SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2422 SelectionDAG &DAG) const {
2423 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2424 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2425
2426 EVT VT = Op.getValueType();
2427 unsigned VTBits = VT.getSizeInBits();
2428 SDLoc dl(Op);
2429 SDValue ShOpLo = Op.getOperand(0);
2430 SDValue ShOpHi = Op.getOperand(1);
2431 SDValue ShAmt = Op.getOperand(2);
2432 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2433
2434 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2435 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2436 // {dHi, dLo} = {aHi, aLo} >> Amt
2437 // dHi = aHi >> Amt
2438 // dLo = shf.r.clamp aLo, aHi, Amt
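// For example (illustrative), with {aHi, aLo} = {0x00000001, 0x00000000} and
// Amt = 4, shf.r.clamp funnels aHi's low bits into dLo = 0x10000000 while
// dHi = aHi >> 4 = 0, i.e. the 64-bit value 0x1'0000'0000 shifted right by 4.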
2439
2440 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2441 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2442 ShAmt);
2443
2444 SDValue Ops[2] = { Lo, Hi };
2445 return DAG.getMergeValues(Ops, dl);
2446 }
2447 else {
2448 // {dHi, dLo} = {aHi, aLo} >> Amt
2449 // - if (Amt>=size) then
2450 // dLo = aHi >> (Amt-size)
2451 // dHi = aHi >> Amt (this is either all 0 or all 1)
2452 // else
2453 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2454 // dHi = aHi >> Amt
2455
2456 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2457 DAG.getConstant(VTBits, dl, MVT::i32),
2458 ShAmt);
2459 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2460 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2461 DAG.getConstant(VTBits, dl, MVT::i32));
2462 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2463 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2464 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2465
2466 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2467 DAG.getConstant(VTBits, dl, MVT::i32),
2468 ISD::SETGE);
2469 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2470 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2471
2472 SDValue Ops[2] = { Lo, Hi };
2473 return DAG.getMergeValues(Ops, dl);
2474 }
2475}
2476
2477/// LowerShiftLeftParts - Lower SHL_PARTS, which
2478/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2479/// amount, or
2480/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2481/// amount.
2482SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2483 SelectionDAG &DAG) const {
2484 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2485 assert(Op.getOpcode() == ISD::SHL_PARTS);
2486
2487 EVT VT = Op.getValueType();
2488 unsigned VTBits = VT.getSizeInBits();
2489 SDLoc dl(Op);
2490 SDValue ShOpLo = Op.getOperand(0);
2491 SDValue ShOpHi = Op.getOperand(1);
2492 SDValue ShAmt = Op.getOperand(2);
2493
2494 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2495 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2496 // {dHi, dLo} = {aHi, aLo} << Amt
2497 // dHi = shf.l.clamp aLo, aHi, Amt
2498 // dLo = aLo << Amt
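// For example (illustrative), with {aHi, aLo} = {0x00000000, 0x80000000} and
// Amt = 1, dHi = shf.l.clamp(aLo, aHi, 1) = 0x00000001 and dLo = aLo << 1 = 0,
// i.e. the 64-bit value 0x8000'0000 shifted left by 1.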
2499
2500 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2501 ShAmt);
2502 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2503
2504 SDValue Ops[2] = { Lo, Hi };
2505 return DAG.getMergeValues(Ops, dl);
2506 }
2507 else {
2508 // {dHi, dLo} = {aHi, aLo} << Amt
2509 // - if (Amt>=size) then
2510 // dLo = aLo << Amt (all 0)
2511 //    dHi = aLo << (Amt-size)
2512 // else
2513 // dLo = aLo << Amt
2514 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2515
2516 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2517 DAG.getConstant(VTBits, dl, MVT::i32),
2518 ShAmt);
2519 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2520 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2521 DAG.getConstant(VTBits, dl, MVT::i32));
2522 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2523 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2524 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2525
2526 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2527 DAG.getConstant(VTBits, dl, MVT::i32),
2528 ISD::SETGE);
2529 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2530 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2531
2532 SDValue Ops[2] = { Lo, Hi };
2533 return DAG.getMergeValues(Ops, dl);
2534 }
2535}
2536
2537SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2538 EVT VT = Op.getValueType();
2539
2540 if (VT == MVT::f32)
2541 return LowerFROUND32(Op, DAG);
2542
2543 if (VT == MVT::f64)
2544 return LowerFROUND64(Op, DAG);
2545
2546 llvm_unreachable("unhandled type");
2547}
2548
2549// This is the rounding method used in CUDA libdevice, expressed in C-like code:
2550// float roundf(float A)
2551// {
2552// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2553// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2554// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2555// }
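// For example, roundf(2.5f) computes (float)(int)(2.5f + 0.5f) = 3.0f (halfway
// cases round away from zero), while roundf(0.3f) takes the abs(A) < 0.5 path
// and returns (float)(int)0.3f = 0.0f, with the sign of A preserved by the
// FTRUNC in the lowering below.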
2556SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2557 SelectionDAG &DAG) const {
2558 SDLoc SL(Op);
2559 SDValue A = Op.getOperand(0);
2560 EVT VT = Op.getValueType();
2561
2562 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2563
2564 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2565 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2566 const int SignBitMask = 0x80000000;
2567 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2568 DAG.getConstant(SignBitMask, SL, MVT::i32));
2569 const int PointFiveInBits = 0x3F000000;
2570 SDValue PointFiveWithSignRaw =
2571 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2572 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2573 SDValue PointFiveWithSign =
2574 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2575 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2576 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2577
2578 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2579 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2580 SDValue IsLarge =
2581 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2582 ISD::SETOGT);
2583 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2584
2585 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2586 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2587 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2588 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2589 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2590}
2591
2592// The implementation of round(double) is similar to that of round(float) in
2593// that they both separate the value range into three regions and use a method
2594// specific to the region to round the values. However, round(double) first
2595// calculates the round of the absolute value and then adds the sign back while
2596// round(float) directly rounds the value with sign.
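// For example, round(-2.5) computes trunc(abs(-2.5) + 0.5) = 3.0 and then
// copies the sign back to give -3.0; round(0.1) hits the abs(A) < 0.5 case and
// yields a signed zero.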
2597SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2598 SelectionDAG &DAG) const {
2599 SDLoc SL(Op);
2600 SDValue A = Op.getOperand(0);
2601 EVT VT = Op.getValueType();
2602
2603 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2604
2605 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2606 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2607 DAG.getConstantFP(0.5, SL, VT));
2608 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2609
2610 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2611 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2612 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2613 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2614 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2615 DAG.getConstantFP(0, SL, VT),
2616 RoundedA);
2617
2618 // Add sign to rounded_A
2619 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2620 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2621
2622 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2623 SDValue IsLarge =
2624 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2625 ISD::SETOGT);
2626 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2627}
2628
2629SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2630 SelectionDAG &DAG) const {
2631 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2632
2633 if (Op.getValueType() == MVT::bf16) {
2634 SDLoc Loc(Op);
2635 return DAG.getNode(
2636 ISD::FP_ROUND, Loc, MVT::bf16,
2637 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2638 DAG.getIntPtrConstant(0, Loc));
2639 }
2640
2641 // Everything else is considered legal.
2642 return Op;
2643}
2644
2645SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2646 SelectionDAG &DAG) const {
2647 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2648
2649 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2650 SDLoc Loc(Op);
2651 return DAG.getNode(
2652 Op.getOpcode(), Loc, Op.getValueType(),
2653 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2654 }
2655
2656 // Everything else is considered legal.
2657 return Op;
2658}
2659
2660SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2661 SelectionDAG &DAG) const {
2662 EVT NarrowVT = Op.getValueType();
2663 SDValue Wide = Op.getOperand(0);
2664 EVT WideVT = Wide.getValueType();
2665 if (NarrowVT.getScalarType() == MVT::bf16) {
2666 const TargetLowering *TLI = STI.getTargetLowering();
2667 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2668 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2669 }
2670 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2671 // This combination was the first to support f32 -> bf16.
2672 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2673 if (WideVT.getScalarType() == MVT::f32) {
2674 return Op;
2675 }
2676 if (WideVT.getScalarType() == MVT::f64) {
2677 SDLoc Loc(Op);
2678 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2679 // the hardware f32 -> bf16 instruction.
2681 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2682 : MVT::f32,
2683 Wide, Loc, DAG);
2684 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2685 }
2686 }
2687 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2688 }
2689 }
2690
2691 // Everything else is considered legal.
2692 return Op;
2693}
2694
2695SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2696 SelectionDAG &DAG) const {
2697 SDValue Narrow = Op.getOperand(0);
2698 EVT NarrowVT = Narrow.getValueType();
2699 EVT WideVT = Op.getValueType();
2700 if (NarrowVT.getScalarType() == MVT::bf16) {
2701 if (WideVT.getScalarType() == MVT::f32 &&
2702 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2703 SDLoc Loc(Op);
2704 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2705 }
2706 if (WideVT.getScalarType() == MVT::f64 &&
2707 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2708 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2709 : MVT::f32;
2710 SDLoc Loc(Op);
2711 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2712 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2713 } else {
2714 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2715 }
2716 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2717 }
2718 }
2719
2720 // Everything else is considered legal.
2721 return Op;
2722}
2723
2724 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2725 SDLoc DL(Op);
2726 if (Op.getValueType() != MVT::v2i16)
2727 return Op;
2728 EVT EltVT = Op.getValueType().getVectorElementType();
2729 SmallVector<SDValue> VecElements;
2730 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2731 SmallVector<SDValue> ScalarArgs;
2732 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2733 [&](const SDUse &O) {
2734 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2735 O.get(), DAG.getIntPtrConstant(I, DL));
2736 });
2737 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2738 }
2739 SDValue V =
2740 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2741 return V;
2742}
2743
2744SDValue
2745 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2746 switch (Op.getOpcode()) {
2747 case ISD::RETURNADDR:
2748 return SDValue();
2749 case ISD::FRAMEADDR:
2750 return SDValue();
2751 case ISD::GlobalAddress:
2752 return LowerGlobalAddress(Op, DAG);
2753 case ISD::INTRINSIC_W_CHAIN:
2754 return Op;
2755 case ISD::BUILD_VECTOR:
2756 return LowerBUILD_VECTOR(Op, DAG);
2757 case ISD::EXTRACT_SUBVECTOR:
2758 return Op;
2759 case ISD::EXTRACT_VECTOR_ELT:
2760 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2761 case ISD::INSERT_VECTOR_ELT:
2762 return LowerINSERT_VECTOR_ELT(Op, DAG);
2763 case ISD::VECTOR_SHUFFLE:
2764 return LowerVECTOR_SHUFFLE(Op, DAG);
2765 case ISD::CONCAT_VECTORS:
2766 return LowerCONCAT_VECTORS(Op, DAG);
2767 case ISD::STORE:
2768 return LowerSTORE(Op, DAG);
2769 case ISD::LOAD:
2770 return LowerLOAD(Op, DAG);
2771 case ISD::SHL_PARTS:
2772 return LowerShiftLeftParts(Op, DAG);
2773 case ISD::SRA_PARTS:
2774 case ISD::SRL_PARTS:
2775 return LowerShiftRightParts(Op, DAG);
2776 case ISD::SELECT:
2777 return LowerSelect(Op, DAG);
2778 case ISD::FROUND:
2779 return LowerFROUND(Op, DAG);
2780 case ISD::SINT_TO_FP:
2781 case ISD::UINT_TO_FP:
2782 return LowerINT_TO_FP(Op, DAG);
2783 case ISD::FP_TO_SINT:
2784 case ISD::FP_TO_UINT:
2785 return LowerFP_TO_INT(Op, DAG);
2786 case ISD::FP_ROUND:
2787 return LowerFP_ROUND(Op, DAG);
2788 case ISD::FP_EXTEND:
2789 return LowerFP_EXTEND(Op, DAG);
2790 case ISD::VAARG:
2791 return LowerVAARG(Op, DAG);
2792 case ISD::VASTART:
2793 return LowerVASTART(Op, DAG);
2794 case ISD::ABS:
2795 case ISD::SMIN:
2796 case ISD::SMAX:
2797 case ISD::UMIN:
2798 case ISD::UMAX:
2799 case ISD::ADD:
2800 case ISD::SUB:
2801 case ISD::MUL:
2802 case ISD::SHL:
2803 case ISD::SREM:
2804 case ISD::UREM:
2805 return LowerVectorArith(Op, DAG);
2806 case ISD::DYNAMIC_STACKALLOC:
2807 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2808 default:
2809 llvm_unreachable("Custom lowering not defined for operation");
2810 }
2811}
2812
2813// This function is almost a copy of SelectionDAG::expandVAArg().
2814// The only difference is that this one produces loads from the local address space.
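// Roughly, the expansion is (sketch, assuming a va_list that holds a raw pointer):
//   ap   = load ptr, ptr %valist            ; current argument pointer
//   ap   = (ap + align - 1) & -align        ; only if the arg needs extra alignment
//   next = ap + sizeof(argument)
//   store ptr next, ptr %valist             ; bump the pointer
//   result = load <ty>, ptr ap              ; this load targets the local address space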
2815SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2816 const TargetLowering *TLI = STI.getTargetLowering();
2817 SDLoc DL(Op);
2818
2819 SDNode *Node = Op.getNode();
2820 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2821 EVT VT = Node->getValueType(0);
2822 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2823 SDValue Tmp1 = Node->getOperand(0);
2824 SDValue Tmp2 = Node->getOperand(1);
2825 const MaybeAlign MA(Node->getConstantOperandVal(3));
2826
2827 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2828 Tmp1, Tmp2, MachinePointerInfo(V));
2829 SDValue VAList = VAListLoad;
2830
2831 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2832 VAList = DAG.getNode(
2833 ISD::ADD, DL, VAList.getValueType(), VAList,
2834 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2835
2836 VAList = DAG.getNode(
2837 ISD::AND, DL, VAList.getValueType(), VAList,
2838 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2839 }
2840
2841 // Increment the pointer, VAList, to the next vaarg
2842 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2843 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2844 DL, VAList.getValueType()));
2845
2846 // Store the incremented VAList to the legalized pointer
2847 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2848 MachinePointerInfo(V));
2849
2850 const Value *SrcV =
2851 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2852
2853 // Load the actual argument out of the pointer VAList
2854 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2855}
2856
2857SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2858 const TargetLowering *TLI = STI.getTargetLowering();
2859 SDLoc DL(Op);
2860 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2861
2862 // Store the address of unsized array <function>_vararg[] in the ap object.
2863 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2864 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2865
2866 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2867 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2868 MachinePointerInfo(SV));
2869}
2870
2871SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2872 SDValue Op0 = Op->getOperand(0);
2873 SDValue Op1 = Op->getOperand(1);
2874 SDValue Op2 = Op->getOperand(2);
2875 SDLoc DL(Op.getNode());
2876
2877 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2878
2879 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2880 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2881 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2882 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2883
2884 return Trunc;
2885}
2886
2887SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2888 if (Op.getValueType() == MVT::i1)
2889 return LowerLOADi1(Op, DAG);
2890
2891 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2892 // handle unaligned loads and have to handle them here.
2893 EVT VT = Op.getValueType();
2894 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2895 LoadSDNode *Load = cast<LoadSDNode>(Op);
2896 EVT MemVT = Load->getMemoryVT();
2897 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2898 MemVT, *Load->getMemOperand())) {
2899 SDValue Ops[2];
2900 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2901 return DAG.getMergeValues(Ops, SDLoc(Op));
2902 }
2903 }
2904
2905 return SDValue();
2906}
2907
2908// v = ld i1* addr
2909// =>
2910// v1 = ld i8* addr (-> i16)
2911// v = trunc i16 to i1
2912SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2913 SDNode *Node = Op.getNode();
2914 LoadSDNode *LD = cast<LoadSDNode>(Node);
2915 SDLoc dl(Node);
2916 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2917 assert(Node->getValueType(0) == MVT::i1 &&
2918 "Custom lowering for i1 load only");
2919 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2920 LD->getPointerInfo(), LD->getAlign(),
2921 LD->getMemOperand()->getFlags());
2922 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2923 // The legalizer (the caller) is expecting two values from the legalized
2924 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2925 // in LegalizeDAG.cpp which also uses MergeValues.
2926 SDValue Ops[] = { result, LD->getChain() };
2927 return DAG.getMergeValues(Ops, dl);
2928}
2929
2930SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2931 StoreSDNode *Store = cast<StoreSDNode>(Op);
2932 EVT VT = Store->getMemoryVT();
2933
2934 if (VT == MVT::i1)
2935 return LowerSTOREi1(Op, DAG);
2936
2937 // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2938 // stores and have to handle them here.
2939 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2940 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2941 VT, *Store->getMemOperand()))
2942 return expandUnalignedStore(Store, DAG);
2943
2944 // v2f16, v2bf16 and v2i16 don't need special handling.
2945 if (Isv2x16VT(VT) || VT == MVT::v4i8)
2946 return SDValue();
2947
2948 if (VT.isVector())
2949 return LowerSTOREVector(Op, DAG);
2950
2951 return SDValue();
2952}
2953
2954SDValue
2955NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2956 SDNode *N = Op.getNode();
2957 SDValue Val = N->getOperand(1);
2958 SDLoc DL(N);
2959 EVT ValVT = Val.getValueType();
2960
2961 if (ValVT.isVector()) {
2962 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2963 // legal. We can (and should) split that into 2 stores of <2 x double> here
2964 // but I'm leaving that as a TODO for now.
2965 if (!ValVT.isSimple())
2966 return SDValue();
2967 switch (ValVT.getSimpleVT().SimpleTy) {
2968 default:
2969 return SDValue();
2970 case MVT::v2i8:
2971 case MVT::v2i16:
2972 case MVT::v2i32:
2973 case MVT::v2i64:
2974 case MVT::v2f16:
2975 case MVT::v2bf16:
2976 case MVT::v2f32:
2977 case MVT::v2f64:
2978 case MVT::v4i8:
2979 case MVT::v4i16:
2980 case MVT::v4i32:
2981 case MVT::v4f16:
2982 case MVT::v4bf16:
2983 case MVT::v4f32:
2984 case MVT::v8f16: // <4 x f16x2>
2985 case MVT::v8bf16: // <4 x bf16x2>
2986 case MVT::v8i16: // <4 x i16x2>
2987 // This is a "native" vector type
2988 break;
2989 }
2990
2991 MemSDNode *MemSD = cast<MemSDNode>(N);
2992 const DataLayout &TD = DAG.getDataLayout();
2993
2994 Align Alignment = MemSD->getAlign();
2995 Align PrefAlign =
2996 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2997 if (Alignment < PrefAlign) {
2998 // This store is not sufficiently aligned, so bail out and let this vector
2999 // store be scalarized. Note that we may still be able to emit smaller
3000 // vector stores. For example, if we are storing a <4 x float> with an
3001 // alignment of 8, this check will fail but the legalizer will try again
3002 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3003 return SDValue();
3004 }
3005
3006 unsigned Opcode = 0;
3007 EVT EltVT = ValVT.getVectorElementType();
3008 unsigned NumElts = ValVT.getVectorNumElements();
3009
3010 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3011 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3012 // stored type to i16 and propagate the "real" type as the memory type.
3013 bool NeedExt = false;
3014 if (EltVT.getSizeInBits() < 16)
3015 NeedExt = true;
3016
3017 bool StoreF16x2 = false;
3018 switch (NumElts) {
3019 default:
3020 return SDValue();
3021 case 2:
3022 Opcode = NVPTXISD::StoreV2;
3023 break;
3024 case 4:
3025 Opcode = NVPTXISD::StoreV4;
3026 break;
3027 case 8:
3028 // v8f16 is a special case. PTX doesn't have st.v8.f16
3029 // instruction. Instead, we split the vector into v2f16 chunks and
3030 // store them with st.v4.b32.
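// For example (illustrative), an <8 x half> store becomes roughly
//   st.v4.b32 [addr], {r0, r1, r2, r3};
// where each 32-bit register holds one packed f16x2 pair.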
3031 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
3032 Opcode = NVPTXISD::StoreV4;
3033 StoreF16x2 = true;
3034 break;
3035 }
3036
3037 SmallVector<SDValue, 8> Ops;
3038
3039 // First is the chain
3040 Ops.push_back(N->getOperand(0));
3041
3042 if (StoreF16x2) {
3043 // Combine f16,f16 -> v2f16
3044 NumElts /= 2;
3045 for (unsigned i = 0; i < NumElts; ++i) {
3046 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3047 DAG.getIntPtrConstant(i * 2, DL));
3048 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3049 DAG.getIntPtrConstant(i * 2 + 1, DL));
3050 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
3051 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
3052 Ops.push_back(V2);
3053 }
3054 } else {
3055 // Then the split values
3056 for (unsigned i = 0; i < NumElts; ++i) {
3057 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3058 DAG.getIntPtrConstant(i, DL));
3059 if (NeedExt)
3060 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3061 Ops.push_back(ExtVal);
3062 }
3063 }
3064
3065 // Then any remaining arguments
3066 Ops.append(N->op_begin() + 2, N->op_end());
3067
3068 SDValue NewSt =
3069 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3070 MemSD->getMemoryVT(), MemSD->getMemOperand());
3071
3072 // return DCI.CombineTo(N, NewSt, true);
3073 return NewSt;
3074 }
3075
3076 return SDValue();
3077}
3078
3079// st i1 v, addr
3080// =>
3081// v1 = zxt v to i16
3082// st.u8 i16, addr
3083SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3084 SDNode *Node = Op.getNode();
3085 SDLoc dl(Node);
3086 StoreSDNode *ST = cast<StoreSDNode>(Node);
3087 SDValue Tmp1 = ST->getChain();
3088 SDValue Tmp2 = ST->getBasePtr();
3089 SDValue Tmp3 = ST->getValue();
3090 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3091 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3092 SDValue Result =
3093 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3094 ST->getAlign(), ST->getMemOperand()->getFlags());
3095 return Result;
3096}
3097
3098// This creates a target external symbol for a function parameter.
3099// The name of the symbol is composed from the parameter's index and the
3100// function name. A negative index corresponds to the special parameter
3101// (unsized array) used for passing variable arguments.
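// For example (illustrative), parameter 1 of a function "foo" is referred to
// as "foo_param_1", and the vararg array (idx == -1) as "foo_vararg".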
3102SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3103 EVT v) const {
3104 StringRef SavedStr = nvTM->getStrPool().save(
3106 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3107}
3108
3109 SDValue NVPTXTargetLowering::LowerFormalArguments(
3110 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3111 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3112 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3113 MachineFunction &MF = DAG.getMachineFunction();
3114 const DataLayout &DL = DAG.getDataLayout();
3115 auto PtrVT = getPointerTy(DAG.getDataLayout());
3116
3117 const Function *F = &MF.getFunction();
3118 const AttributeList &PAL = F->getAttributes();
3119 const TargetLowering *TLI = STI.getTargetLowering();
3120
3121 SDValue Root = DAG.getRoot();
3122 std::vector<SDValue> OutChains;
3123
3124 bool isABI = (STI.getSmVersion() >= 20);
3125 assert(isABI && "Non-ABI compilation is not supported");
3126 if (!isABI)
3127 return Chain;
3128
3129 std::vector<Type *> argTypes;
3130 std::vector<const Argument *> theArgs;
3131 for (const Argument &I : F->args()) {
3132 theArgs.push_back(&I);
3133 argTypes.push_back(I.getType());
3134 }
3135 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3136 // Ins.size() will be larger
3137 // * if there is an aggregate argument with multiple fields (each field
3138 // showing up separately in Ins)
3139 // * if there is a vector argument with more than typical vector-length
3140 // elements (generally if more than 4) where each vector element is
3141 // individually present in Ins.
3142 // So a different index should be used for indexing into Ins.
3143 // See similar issue in LowerCall.
3144 unsigned InsIdx = 0;
3145
3146 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3147 Type *Ty = argTypes[i];
3148
3149 if (theArgs[i]->use_empty()) {
3150 // argument is dead
3151 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3152 SmallVector<EVT, 16> vtparts;
3153
3154 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3155 if (vtparts.empty())
3156 report_fatal_error("Empty parameter types are not supported");
3157
3158 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3159 ++parti) {
3160 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3161 ++InsIdx;
3162 }
3163 if (vtparts.size() > 0)
3164 --InsIdx;
3165 continue;
3166 }
3167 if (Ty->isVectorTy()) {
3168 EVT ObjectVT = getValueType(DL, Ty);
3169 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3170 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3171 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3172 ++InsIdx;
3173 }
3174 if (NumRegs > 0)
3175 --InsIdx;
3176 continue;
3177 }
3178 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3179 continue;
3180 }
3181
3182 // In the following cases, assign a node order of "i+1"
3183 // to newly created nodes. The SDNodes for params have to
3184 // appear in the same order as their order of appearance
3185 // in the original function. "i+1" holds that order.
3186 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3187 bool aggregateIsPacked = false;
3188 if (StructType *STy = dyn_cast<StructType>(Ty))
3189 aggregateIsPacked = STy->isPacked();
3190
3191 SmallVector<EVT, 16> VTs;
3192 SmallVector<uint64_t, 16> Offsets;
3193 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3194 if (VTs.empty())
3195 report_fatal_error("Empty parameter types are not supported");
3196
3197 auto VectorInfo =
3198 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
3199
3200 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3201 int VecIdx = -1; // Index of the first element of the current vector.
3202 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3203 if (VectorInfo[parti] & PVF_FIRST) {
3204 assert(VecIdx == -1 && "Orphaned vector.");
3205 VecIdx = parti;
3206 }
3207
3208 // That's the last element of this store op.
3209 if (VectorInfo[parti] & PVF_LAST) {
3210 unsigned NumElts = parti - VecIdx + 1;
3211 EVT EltVT = VTs[parti];
3212 // i1 is loaded/stored as i8.
3213 EVT LoadVT = EltVT;
3214 if (EltVT == MVT::i1)
3215 LoadVT = MVT::i8;
3216 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3217 // getLoad needs a vector type, but it can't handle
3218 // vectors which contain v2f16 or v2bf16 elements. So we must load
3219 // using i32 here and then bitcast back.
3220 LoadVT = MVT::i32;
3221
3222 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3223 SDValue VecAddr =
3224 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3225 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3226 Value *srcValue = Constant::getNullValue(PointerType::get(
3227 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3228
3229 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3230 if (aggregateIsPacked)
3231 return Align(1);
3232 if (NumElts != 1)
3233 return std::nullopt;
3234 Align PartAlign =
3235 (Offsets[parti] == 0 && PAL.getParamAlignment(i))
3236 ? PAL.getParamAlignment(i).value()
3237 : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3238 return commonAlignment(PartAlign, Offsets[parti]);
3239 }();
3240 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3241 MachinePointerInfo(srcValue), PartAlign,
3242 MachineMemOperand::MODereferenceable |
3243 MachineMemOperand::MOInvariant);
3244 if (P.getNode())
3245 P.getNode()->setIROrder(i + 1);
3246 for (unsigned j = 0; j < NumElts; ++j) {
3247 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3248 DAG.getIntPtrConstant(j, dl));
3249 // We've loaded i1 as an i8 and now must truncate it back to i1
3250 if (EltVT == MVT::i1)
3251 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3252 // v2f16 was loaded as an i32. Now we must bitcast it back.
3253 else if (EltVT != LoadVT)
3254 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3255
3256 // If a promoted integer type is used, truncate it back down to the original type.
3257 MVT PromotedVT;
3258 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3259 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3260 }
3261
3262 // Extend the element if necessary (e.g. an i8 is loaded
3263 // into an i16 register)
3264 if (Ins[InsIdx].VT.isInteger() &&
3265 Ins[InsIdx].VT.getFixedSizeInBits() >
3266 LoadVT.getFixedSizeInBits()) {
3267 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3268 : ISD::ZERO_EXTEND;
3269 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3270 }
3271 InVals.push_back(Elt);
3272 }
3273
3274 // Reset vector tracking state.
3275 VecIdx = -1;
3276 }
3277 ++InsIdx;
3278 }
3279 if (VTs.size() > 0)
3280 --InsIdx;
3281 continue;
3282 }
3283
3284 // Param has the ByVal attribute.
3285 // Return MoveParam(param symbol).
3286 // Ideally, the param symbol could be returned directly,
3287 // but when the SDNode builder decides to use it in a CopyToReg(),
3288 // the machine instruction fails because a TargetExternalSymbol
3289 // (not lowered) is target dependent, and CopyToReg assumes
3290 // the source is already lowered.
3291 EVT ObjectVT = getValueType(DL, Ty);
3292 assert(ObjectVT == Ins[InsIdx].VT &&
3293 "Ins type did not match function type");
3294 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3295 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3296 if (p.getNode())
3297 p.getNode()->setIROrder(i + 1);
3298 InVals.push_back(p);
3299 }
3300
3301 if (!OutChains.empty())
3302 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3303
3304 return Chain;
3305}
3306
3307// Use byte stores when the parameter address of the return value is unaligned.
3308// This may happen when the return value is a field of a packed structure.
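// For example (illustrative), returning a packed struct whose i32 field lives
// at byte offset 1: the field is emitted as four st.param.b8 stores at offsets
// 1..4, each storing (value >> (i * 8)) truncated to a byte.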
3309 static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
3310 uint64_t Offset, EVT ElementType,
3311 SDValue RetVal, const SDLoc &dl) {
3312 // Bit logic only works on integer types
3313 if (adjustElementType(ElementType))
3314 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3315
3316 // Store each byte
3317 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3318 // Shift the byte to the last byte position
3319 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3320 DAG.getConstant(i * 8, dl, MVT::i32));
3321 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3322 ShiftVal};
3323 // Trunc store only the last byte by using
3324 // st.param.b8
3325 // The register type can be larger than b8.
3326 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
3327 DAG.getVTList(MVT::Other), StoreOperands,
3328 MVT::i8, MachinePointerInfo(), std::nullopt,
3329 MachineMemOperand::MOStore);
3330 }
3331 return Chain;
3332}
3333
3334 SDValue
3335 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3336                                  bool isVarArg,
3337                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
3338                                  const SmallVectorImpl<SDValue> &OutVals,
3339                                  const SDLoc &dl, SelectionDAG &DAG) const {
3340 const MachineFunction &MF = DAG.getMachineFunction();
3341 const Function &F = MF.getFunction();
3343
3344 bool isABI = (STI.getSmVersion() >= 20);
3345 assert(isABI && "Non-ABI compilation is not supported");
3346 if (!isABI)
3347 return Chain;
3348
3349 const DataLayout &DL = DAG.getDataLayout();
3350 SmallVector<SDValue, 16> PromotedOutVals;
3351 SmallVector<EVT, 16> VTs;
3352 SmallVector<uint64_t, 16> Offsets;
3353 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3354 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3355
3356 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3357 SDValue PromotedOutVal = OutVals[i];
3358 MVT PromotedVT;
3359 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3360 VTs[i] = EVT(PromotedVT);
3361 }
3362 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3363 ISD::NodeType Ext =
3364 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3365 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3366 }
3367 PromotedOutVals.push_back(PromotedOutVal);
3368 }
3369
3370 auto VectorInfo = VectorizePTXValueVTs(
3371 VTs, Offsets,
3373 : Align(1));
3374
3375 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3376 // 32 bits are sign-extended or zero-extended, depending on whether
3377 // they are signed or unsigned types.
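// For example, an i8 return value is widened to i32 (sign- or zero-extended
// according to the signext/zeroext attribute) and stored with st.param.b32.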
3378 bool ExtendIntegerRetVal =
3379 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3380
3381 SmallVector<SDValue, 6> StoreOperands;
3382 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3383 SDValue OutVal = OutVals[i];
3384 SDValue RetVal = PromotedOutVals[i];
3385
3386 if (ExtendIntegerRetVal) {
3387 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3388 : ISD::ZERO_EXTEND,
3389 dl, MVT::i32, RetVal);
3390 } else if (OutVal.getValueSizeInBits() < 16) {
3391 // Use 16-bit registers for small load-stores as it's the
3392 // smallest general purpose register size supported by NVPTX.
3393 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3394 }
3395
3396 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3397 // for a scalar store. In such cases, fall back to byte stores.
3398 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3399 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3400 Align ElementTypeAlign =
3401 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3402 Align ElementAlign =
3403 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3404 if (ElementAlign < ElementTypeAlign) {
3405 assert(StoreOperands.empty() && "Orphaned operand list.");
3406 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3407 RetVal, dl);
3408
3409 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3410 // into the graph, so just move on to the next element.
3411 continue;
3412 }
3413 }
3414
3415 // New load/store. Record chain and offset operands.
3416 if (VectorInfo[i] & PVF_FIRST) {
3417 assert(StoreOperands.empty() && "Orphaned operand list.");
3418 StoreOperands.push_back(Chain);
3419 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3420 }
3421
3422 // Record the value to return.
3423 StoreOperands.push_back(RetVal);
3424
3425 // That's the last element of this store op.
3426 if (VectorInfo[i] & PVF_LAST) {
3427 NVPTXISD::NodeType Op;
3428 unsigned NumElts = StoreOperands.size() - 2;
3429 switch (NumElts) {
3430 case 1:
3431 Op = NVPTXISD::StoreRetval;
3432 break;
3433 case 2:
3434 Op = NVPTXISD::StoreRetvalV2;
3435 break;
3436 case 4:
3437 Op = NVPTXISD::StoreRetvalV4;
3438 break;
3439 default:
3440 llvm_unreachable("Invalid vector info.");
3441 }
3442
3443 // Adjust type of load/store op if we've extended the scalar
3444 // return value.
3445 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3446 Chain = DAG.getMemIntrinsicNode(
3447 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3448 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3449 // Cleanup vector state.
3450 StoreOperands.clear();
3451 }
3452 }
3453
3454 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3455}
3456
3457 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3458 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3459 SelectionDAG &DAG) const {
3460 if (Constraint.size() > 1)
3461 return;
3462 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3463}
3464
3465static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3466 switch (Intrinsic) {
3467 default:
3468 return 0;
3469
3470 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3472 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3474 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3476 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3478 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3479 return NVPTXISD::Tex1DS32S32;
3480 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3482 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3484 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3486 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3487 return NVPTXISD::Tex1DU32S32;
3488 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3490 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3492 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3494
3495 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3497 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3499 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3501 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3503 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3505 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3507 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3509 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3511 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3513 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3515 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3517 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3519
3520 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3522 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3524 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3526 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3528 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3529 return NVPTXISD::Tex2DS32S32;
3530 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3532 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3534 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3536 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3537 return NVPTXISD::Tex2DU32S32;
3538 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3540 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3542 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3544
3545 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3547 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3549 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3551 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3553 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3555 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3557 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3559 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3561 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3563 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3565 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3567 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3569
3570 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3572 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3574 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3576 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3578 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3579 return NVPTXISD::Tex3DS32S32;
3580 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3582 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3584 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3586 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3587 return NVPTXISD::Tex3DU32S32;
3588 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3590 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3592 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3594
3595 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3597 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3599 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3601 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3603 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3605 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3607
3608 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3610 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3612 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3614 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3616 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3618 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3620
3621 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3623 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3625 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3627 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3629 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3631 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3633 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3635 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3637 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3639 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3641 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3643 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3645
3646 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3648 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3650 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3652 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3654 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3656 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3658 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3660 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3662 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3664 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3666 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3668 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3670
3671 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3673 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3675 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3677 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3679 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3681 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3683 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3685 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3687 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3689 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3691 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3693 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3695
3696 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3698 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3700 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3702 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3704 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3706 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3708 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3710 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3712 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3714 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3716 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3718 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3720
3721 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3723 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3725 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3727 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3729 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3731 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3733 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3735 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3737 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3739 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3741 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3743 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3745
3746 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3748 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3750 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3752 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3754 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3756 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3758 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3760 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3762 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3764 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3766 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3768 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3770
3771 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3773 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3775 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3777 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3779 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3781 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3783
3784 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3786 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3788 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3790 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3792 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3794 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3796
3797 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3799 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3801 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3803 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3805 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3807 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3809
3810 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3812 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3814 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3816 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3818 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3820 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3822 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3824 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3826 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3828 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3830 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3832 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3834 }
3835}
3836
3837static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3838 switch (Intrinsic) {
3839 default:
3840 return 0;
3841 case Intrinsic::nvvm_suld_1d_i8_clamp:
3843 case Intrinsic::nvvm_suld_1d_i16_clamp:
3845 case Intrinsic::nvvm_suld_1d_i32_clamp:
3847 case Intrinsic::nvvm_suld_1d_i64_clamp:
3849 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3851 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3853 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3855 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3857 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3859 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3861 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3863 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3865 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3867 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3869 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3871 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3873 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3875 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3877 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3879 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3881 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3883 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3885 case Intrinsic::nvvm_suld_2d_i8_clamp:
3887 case Intrinsic::nvvm_suld_2d_i16_clamp:
3889 case Intrinsic::nvvm_suld_2d_i32_clamp:
3891 case Intrinsic::nvvm_suld_2d_i64_clamp:
3893 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3895 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3897 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3899 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3901 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3903 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3905 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3907 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3909 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3911 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3913 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3915 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3917 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3919 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3921 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3923 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3925 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3927 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3929 case Intrinsic::nvvm_suld_3d_i8_clamp:
3931 case Intrinsic::nvvm_suld_3d_i16_clamp:
3933 case Intrinsic::nvvm_suld_3d_i32_clamp:
3935 case Intrinsic::nvvm_suld_3d_i64_clamp:
3937 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3939 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3941 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3943 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3945 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3947 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3949 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3951 case Intrinsic::nvvm_suld_1d_i8_trap:
3953 case Intrinsic::nvvm_suld_1d_i16_trap:
3955 case Intrinsic::nvvm_suld_1d_i32_trap:
3957 case Intrinsic::nvvm_suld_1d_i64_trap:
3959 case Intrinsic::nvvm_suld_1d_v2i8_trap:
3961 case Intrinsic::nvvm_suld_1d_v2i16_trap:
3963 case Intrinsic::nvvm_suld_1d_v2i32_trap:
3965 case Intrinsic::nvvm_suld_1d_v2i64_trap:
3967 case Intrinsic::nvvm_suld_1d_v4i8_trap:
3969 case Intrinsic::nvvm_suld_1d_v4i16_trap:
3971 case Intrinsic::nvvm_suld_1d_v4i32_trap:
3973 case Intrinsic::nvvm_suld_1d_array_i8_trap:
3975 case Intrinsic::nvvm_suld_1d_array_i16_trap:
3977 case Intrinsic::nvvm_suld_1d_array_i32_trap:
3979 case Intrinsic::nvvm_suld_1d_array_i64_trap:
3981 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3983 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3985 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3987 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3989 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3991 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3993 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3995 case Intrinsic::nvvm_suld_2d_i8_trap:
3997 case Intrinsic::nvvm_suld_2d_i16_trap:
3999 case Intrinsic::nvvm_suld_2d_i32_trap:
4001 case Intrinsic::nvvm_suld_2d_i64_trap:
4003 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4005 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4007 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4009 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4011 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4013 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4015 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4017 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4019 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4021 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4023 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4025 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4027 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4029 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4031 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4033 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4035 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4037 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4039 case Intrinsic::nvvm_suld_3d_i8_trap:
4041 case Intrinsic::nvvm_suld_3d_i16_trap:
4043 case Intrinsic::nvvm_suld_3d_i32_trap:
4045 case Intrinsic::nvvm_suld_3d_i64_trap:
4047 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4049 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4051 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4053 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4055 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4057 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4059 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4061 case Intrinsic::nvvm_suld_1d_i8_zero:
4063 case Intrinsic::nvvm_suld_1d_i16_zero:
4065 case Intrinsic::nvvm_suld_1d_i32_zero:
4067 case Intrinsic::nvvm_suld_1d_i64_zero:
4069 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4071 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4073 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4075 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4077 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4079 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4081 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4083 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4085 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4087 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4089 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4091 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4093 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4095 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4097 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4099 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4101 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4103 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4105 case Intrinsic::nvvm_suld_2d_i8_zero:
4107 case Intrinsic::nvvm_suld_2d_i16_zero:
4109 case Intrinsic::nvvm_suld_2d_i32_zero:
4111 case Intrinsic::nvvm_suld_2d_i64_zero:
4113 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4115 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4117 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4119 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4121 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4123 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4125 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4127 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4129 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4131 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4133 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4135 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4137 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4139 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4141 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4143 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4145 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4147 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4149 case Intrinsic::nvvm_suld_3d_i8_zero:
4151 case Intrinsic::nvvm_suld_3d_i16_zero:
4153 case Intrinsic::nvvm_suld_3d_i32_zero:
4155 case Intrinsic::nvvm_suld_3d_i64_zero:
4157 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4159 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4161 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4163 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4165 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4167 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4169 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4171 }
4172}
4173
4174// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4175// TgtMemIntrinsic because we need information that is only available in the
4176// "Value" type of the destination pointer. In particular, we need the
4177// address space information.
4178
4179bool NVPTXTargetLowering::getTgtMemIntrinsic(
4180    IntrinsicInfo &Info, const CallInst &I,
4181 MachineFunction &MF, unsigned Intrinsic) const {
4182 switch (Intrinsic) {
4183 default:
4184 return false;
4185 case Intrinsic::nvvm_match_all_sync_i32p:
4186 case Intrinsic::nvvm_match_all_sync_i64p:
4188 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4189 // in order to model data exchange with other threads, but perform no real
4190 // memory accesses.
4191 Info.memVT = MVT::i1;
4192
4193    // Our result depends on both our own and the other threads' arguments.
4195 return true;
4196 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4197 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4198 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4199 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4200 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4201 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4202 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4203 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4204 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4205 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4206 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4207 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4208 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4209 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4210 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4211 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4212 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4213 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4214 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4215 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4216 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4217 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4218 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4219 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4221 Info.memVT = MVT::v8f16;
4222 Info.ptrVal = I.getArgOperand(0);
4223 Info.offset = 0;
4225 Info.align = Align(16);
4226 return true;
4227 }
4228 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4229 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4230 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4231 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4232 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4233 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4234 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4235 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4236 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4237 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4238 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4239 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4240 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4241 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4242 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4243 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4244 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4245 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4246 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4247 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4248 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4249 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4250 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4251 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4253 Info.memVT = MVT::v2i32;
4254 Info.ptrVal = I.getArgOperand(0);
4255 Info.offset = 0;
4257 Info.align = Align(8);
4258 return true;
4259 }
4260
4261 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4262 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4263 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4264 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4265 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4266 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4267 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4268 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4269 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4270 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4271 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4272 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4273 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4274 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4275 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4276 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4277
4278 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4279 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4280 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4281 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4282 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4283 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4284 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4285 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4286 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4287 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4288 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4289 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4290 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4291 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4292 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4293 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4294 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4295 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4297 Info.memVT = MVT::v4i32;
4298 Info.ptrVal = I.getArgOperand(0);
4299 Info.offset = 0;
4301 Info.align = Align(16);
4302 return true;
4303 }
4304
4305 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4306 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4307 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4308 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4309 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4310 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4311 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4312 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4313
4314 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4315 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4316 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4317 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4318 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4319 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4320 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4321 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4322 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4323 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4324 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4325 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4326 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4327 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4328 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4329 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4330 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4331 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4332 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4333 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4334 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4335 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4337 Info.memVT = MVT::i32;
4338 Info.ptrVal = I.getArgOperand(0);
4339 Info.offset = 0;
4341 Info.align = Align(4);
4342 return true;
4343 }
4344
4345 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4346 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4347 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4348 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4349 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4350 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4351 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4352 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4353 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4354 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4355 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4356 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4358 Info.memVT = MVT::v4f16;
4359 Info.ptrVal = I.getArgOperand(0);
4360 Info.offset = 0;
4362 Info.align = Align(16);
4363 return true;
4364 }
4365
4366 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4367 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4368 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4369 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4370 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4371 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4372 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4373 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4374 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4375 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4376 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4377 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4378 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4379 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4380 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4381 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4383 Info.memVT = MVT::v8f32;
4384 Info.ptrVal = I.getArgOperand(0);
4385 Info.offset = 0;
4387 Info.align = Align(16);
4388 return true;
4389 }
4390
4391 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4392 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4393 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4394 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4395
4396 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4397 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4398 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4399 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4400
4401 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4402 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4403 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4404 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4405 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4406 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4407 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4408 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4409 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4410 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4411 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4412 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4414 Info.memVT = MVT::v8i32;
4415 Info.ptrVal = I.getArgOperand(0);
4416 Info.offset = 0;
4418 Info.align = Align(16);
4419 return true;
4420 }
4421
4422 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4423 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4424 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4425 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4426 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4427 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4428 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4429 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4430 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4431 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4433 Info.memVT = MVT::v2i32;
4434 Info.ptrVal = I.getArgOperand(0);
4435 Info.offset = 0;
4437 Info.align = Align(8);
4438 return true;
4439 }
4440
4441 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4442 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4443 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4444 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4445
4446 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4447 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4448 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4449 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4451 Info.memVT = MVT::f64;
4452 Info.ptrVal = I.getArgOperand(0);
4453 Info.offset = 0;
4455 Info.align = Align(8);
4456 return true;
4457 }
4458
4459 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4460 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4461 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4462 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4464 Info.memVT = MVT::v2f64;
4465 Info.ptrVal = I.getArgOperand(0);
4466 Info.offset = 0;
4468 Info.align = Align(16);
4469 return true;
4470 }
4471
4472 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4473 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4474 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4475 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4476 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4477 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4478 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4479 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4480 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4481 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4482 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4483 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4485 Info.memVT = MVT::v4f16;
4486 Info.ptrVal = I.getArgOperand(0);
4487 Info.offset = 0;
4489 Info.align = Align(16);
4490 return true;
4491 }
4492
4493 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4494 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4495 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4496 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4497 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4498 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4499 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4500 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4501 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4502 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4503 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4504 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4505 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4506 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4507 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4508 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4510 Info.memVT = MVT::v8f32;
4511 Info.ptrVal = I.getArgOperand(0);
4512 Info.offset = 0;
4514 Info.align = Align(16);
4515 return true;
4516 }
4517
4518 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4519 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4520 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4521 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4522 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4523 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4524 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4525 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4526 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4527 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4528 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4529 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4531 Info.memVT = MVT::v8i32;
4532 Info.ptrVal = I.getArgOperand(0);
4533 Info.offset = 0;
4535 Info.align = Align(16);
4536 return true;
4537 }
4538
4539 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4540 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4541 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4542 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4543 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4544 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4545 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4546 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4548 Info.memVT = MVT::v2i32;
4549 Info.ptrVal = I.getArgOperand(0);
4550 Info.offset = 0;
4552 Info.align = Align(8);
4553 return true;
4554 }
4555
4556 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4557 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4558 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4559 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4561 Info.memVT = MVT::v2f64;
4562 Info.ptrVal = I.getArgOperand(0);
4563 Info.offset = 0;
4565 Info.align = Align(16);
4566 return true;
4567 }
4568
4569 case Intrinsic::nvvm_atomic_load_inc_32:
4570 case Intrinsic::nvvm_atomic_load_dec_32:
4571
4572 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4573 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4574 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4575 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4576 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4577 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4578 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4579 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4580 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4581 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4582 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4583 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4584 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4585 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4586 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4587 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4588 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4589 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4590 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4591 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4592 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4593 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4594 auto &DL = I.getModule()->getDataLayout();
4596 Info.memVT = getValueType(DL, I.getType());
4597 Info.ptrVal = I.getArgOperand(0);
4598 Info.offset = 0;
4600 Info.align.reset();
4601 return true;
4602 }
4603
4604 case Intrinsic::nvvm_ldu_global_i:
4605 case Intrinsic::nvvm_ldu_global_f:
4606 case Intrinsic::nvvm_ldu_global_p: {
4607 auto &DL = I.getModule()->getDataLayout();
4609 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4610 Info.memVT = getValueType(DL, I.getType());
4611    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4612 Info.memVT = getPointerTy(DL);
4613 else
4614 Info.memVT = getValueType(DL, I.getType());
4615 Info.ptrVal = I.getArgOperand(0);
4616 Info.offset = 0;
4618 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4619
4620 return true;
4621 }
4622 case Intrinsic::nvvm_ldg_global_i:
4623 case Intrinsic::nvvm_ldg_global_f:
4624 case Intrinsic::nvvm_ldg_global_p: {
4625 auto &DL = I.getModule()->getDataLayout();
4626
4628 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4629 Info.memVT = getValueType(DL, I.getType());
4630 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
4631 Info.memVT = getPointerTy(DL);
4632 else
4633 Info.memVT = getValueType(DL, I.getType());
4634 Info.ptrVal = I.getArgOperand(0);
4635 Info.offset = 0;
4637 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4638
4639 return true;
4640 }
4641
4642 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4643 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4644 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4645 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4646 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4647 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4648 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4649 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4650 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4651 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4652 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4653 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4654 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4655 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4656 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4657 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4658 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4659 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4660 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4661 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4662 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4663 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4664 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4665 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4666 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4667 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4668 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4669 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4670 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4671 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4672 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4673 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4674 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4675 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4676 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4677 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4678 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4679 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4680 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4681 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4682 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4683 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4684 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4685 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4686 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4687 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4688 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4689 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4690 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4691 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4692 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4693 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4694 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4695 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4696 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4697 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4698 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4699 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4700 Info.opc = getOpcForTextureInstr(Intrinsic);
4701 Info.memVT = MVT::v4f32;
4702 Info.ptrVal = nullptr;
4703 Info.offset = 0;
4705 Info.align = Align(16);
4706 return true;
4707
4708 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4709 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4710 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4711 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4712 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4713 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4714 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4715 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4716 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4717 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4718 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4719 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4720 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4721 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4722 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4723 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4724 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4725 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4726 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4727 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4728 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4729 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4730 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4731 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4732 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4733 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4734 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4735 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4736 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4737 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4738 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4739 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4740 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4741 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4742 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4743 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4744 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4745 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4746 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4747 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4748 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4749 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4750 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4751 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4752 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4753 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4754 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4755 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4756 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4757 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4758 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4759 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4760 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4761 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4762 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4763 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4764 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4765 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4766 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4767 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4768 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4769 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4770 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4771 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4772 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4773 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4774 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4775 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4776 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4777 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4778 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4779 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4780 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4781 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4782 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4783 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4784 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4785 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4786 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4787 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4788 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4789 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4790 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4791 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4792 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4793 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4794 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4795 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4796 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4797 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4798 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4799 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4800 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4801 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4802 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4803 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4804 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4805 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4806 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4807 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4808 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4809 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4810 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4811 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4812 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4813 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4814 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4815 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4816 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4817 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4818 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4819 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4820 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4821 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4822 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4823 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4824 Info.opc = getOpcForTextureInstr(Intrinsic);
4825 Info.memVT = MVT::v4i32;
4826 Info.ptrVal = nullptr;
4827 Info.offset = 0;
4829 Info.align = Align(16);
4830 return true;
4831
4832 case Intrinsic::nvvm_suld_1d_i8_clamp:
4833 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4834 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4835 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4836 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4837 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4838 case Intrinsic::nvvm_suld_2d_i8_clamp:
4839 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4840 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4841 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4842 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4843 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4844 case Intrinsic::nvvm_suld_3d_i8_clamp:
4845 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4846 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4847 case Intrinsic::nvvm_suld_1d_i8_trap:
4848 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4849 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4850 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4851 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4852 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4853 case Intrinsic::nvvm_suld_2d_i8_trap:
4854 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4855 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4856 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4857 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4858 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4859 case Intrinsic::nvvm_suld_3d_i8_trap:
4860 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4861 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4862 case Intrinsic::nvvm_suld_1d_i8_zero:
4863 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4864 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4865 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4866 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4867 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4868 case Intrinsic::nvvm_suld_2d_i8_zero:
4869 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4870 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4871 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4872 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4873 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4874 case Intrinsic::nvvm_suld_3d_i8_zero:
4875 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4876 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4877 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4878 Info.memVT = MVT::i8;
4879 Info.ptrVal = nullptr;
4880 Info.offset = 0;
4882 Info.align = Align(16);
4883 return true;
4884
4885 case Intrinsic::nvvm_suld_1d_i16_clamp:
4886 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4887 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4888 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4889 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4890 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4891 case Intrinsic::nvvm_suld_2d_i16_clamp:
4892 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4893 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4894 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4895 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4896 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4897 case Intrinsic::nvvm_suld_3d_i16_clamp:
4898 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4899 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4900 case Intrinsic::nvvm_suld_1d_i16_trap:
4901 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4902 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4903 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4904 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4905 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4906 case Intrinsic::nvvm_suld_2d_i16_trap:
4907 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4908 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4909 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4910 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4911 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4912 case Intrinsic::nvvm_suld_3d_i16_trap:
4913 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4914 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4915 case Intrinsic::nvvm_suld_1d_i16_zero:
4916 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4917 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4918 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4919 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4920 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4921 case Intrinsic::nvvm_suld_2d_i16_zero:
4922 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4923 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4924 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4925 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4926 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4927 case Intrinsic::nvvm_suld_3d_i16_zero:
4928 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4929 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4930 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4931 Info.memVT = MVT::i16;
4932 Info.ptrVal = nullptr;
4933 Info.offset = 0;
4935 Info.align = Align(16);
4936 return true;
4937
4938 case Intrinsic::nvvm_suld_1d_i32_clamp:
4939 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4940 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4941 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4942 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4943 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4944 case Intrinsic::nvvm_suld_2d_i32_clamp:
4945 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4946 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4947 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4948 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4949 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4950 case Intrinsic::nvvm_suld_3d_i32_clamp:
4951 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4952 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4953 case Intrinsic::nvvm_suld_1d_i32_trap:
4954 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4955 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4956 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4957 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4958 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4959 case Intrinsic::nvvm_suld_2d_i32_trap:
4960 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4961 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4962 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4963 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4964 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4965 case Intrinsic::nvvm_suld_3d_i32_trap:
4966 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4967 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4968 case Intrinsic::nvvm_suld_1d_i32_zero:
4969 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4970 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4971 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4972 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4973 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4974 case Intrinsic::nvvm_suld_2d_i32_zero:
4975 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4976 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4977 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4978 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4979 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4980 case Intrinsic::nvvm_suld_3d_i32_zero:
4981 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4982 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4983 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4984 Info.memVT = MVT::i32;
4985 Info.ptrVal = nullptr;
4986 Info.offset = 0;
4988 Info.align = Align(16);
4989 return true;
4990
4991 case Intrinsic::nvvm_suld_1d_i64_clamp:
4992 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4993 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4994 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4995 case Intrinsic::nvvm_suld_2d_i64_clamp:
4996 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4997 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4998 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4999 case Intrinsic::nvvm_suld_3d_i64_clamp:
5000 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5001 case Intrinsic::nvvm_suld_1d_i64_trap:
5002 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5003 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5004 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5005 case Intrinsic::nvvm_suld_2d_i64_trap:
5006 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5007 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5008 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5009 case Intrinsic::nvvm_suld_3d_i64_trap:
5010 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5011 case Intrinsic::nvvm_suld_1d_i64_zero:
5012 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5013 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5014 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5015 case Intrinsic::nvvm_suld_2d_i64_zero:
5016 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5017 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5018 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5019 case Intrinsic::nvvm_suld_3d_i64_zero:
5020 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5021 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5022 Info.memVT = MVT::i64;
5023 Info.ptrVal = nullptr;
5024 Info.offset = 0;
5026 Info.align = Align(16);
5027 return true;
5028 }
5029 return false;
5030}
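
To illustrate what this hook reports to the rest of the backend, here is a minimal stand-alone sketch mirroring the wmma f16 load cases above (v8f16 memory type, 16-byte alignment). The struct and function names are invented for the example; they are not the LLVM types used in this file.

#include <cstdint>
#include <string>

// Invented stand-in for the handful of IntrinsicInfo fields filled above.
struct ExampleMemIntrinsicInfo {
  std::string MemVT;   // type of the memory access, e.g. "v8f16"
  const void *PtrVal;  // pointer operand the access is based on
  int64_t Offset;      // constant offset from PtrVal
  unsigned AlignBytes; // alignment the backend may assume
};

// Mirrors the wmma.m16n16k16.load.a f16 cases: an 8 x f16 fragment is read
// through the first argument with 16-byte alignment.
ExampleMemIntrinsicInfo describeWmmaF16Load(const void *FragmentPtr) {
  return {"v8f16", FragmentPtr, /*Offset=*/0, /*AlignBytes=*/16};
}
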
5031
5032/// getFunctionParamOptimizedAlign - since function arguments are passed via
5033/// .param space, we may want to increase their alignment in a way that
5034/// ensures that we can effectively vectorize their loads & stores. We can
5035/// increase the alignment only if the function has internal or private
5036/// linkage, since for other linkage types callers may already rely on the
5037/// default alignment. To allow 128-bit vectorized loads/stores, this function
5038/// ensures that alignment is 16 or greater.
5039Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5040    const Function *F, Type *ArgTy, const DataLayout &DL) const {
5041 const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
5042
5043 // If a function has linkage different from internal or private, we
5044 // must use default ABI alignment as external users rely on it. Same
5045 // for a function that may be called from a function pointer.
5046 if (!F || !F->hasLocalLinkage() ||
5047 F->hasAddressTaken(/*Users=*/nullptr,
5048 /*IgnoreCallbackUses=*/false,
5049 /*IgnoreAssumeLikeCalls=*/true,
5050 /*IgnoreLLVMUsed=*/true))
5051 return Align(ABITypeAlign);
5052
5053 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5054 return Align(std::max(uint64_t(16), ABITypeAlign));
5055}
5056
5057/// Helper for computing alignment of a device function byval parameter.
5059 const Function *F, Type *ArgTy, Align InitialAlign,
5060 const DataLayout &DL) const {
5061 Align ArgAlign = InitialAlign;
5062 // Try to increase alignment to enhance vectorization options.
5063 if (F)
5064 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5065
5066 // Old ptx versions have a bug. When PTX code takes address of
5067 // byval parameter with alignment < 4, ptxas generates code to
5068 // spill argument into memory. Alas on sm_50+ ptxas generates
5069 // SASS code that fails with misaligned access. To work around
5070 // the problem, make sure that we align byval parameters by at
5071 // least 4. This bug seems to be fixed at least starting from
5072 // ptxas > 9.0.
5073 // TODO: remove this after verifying the bug is not reproduced
5074 // on non-deprecated ptxas versions.
5076 ArgAlign = std::max(ArgAlign, Align(4));
5077
5078 return ArgAlign;
5079}
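
A condensed restatement of the two alignment rules above, as a stand-alone sketch with simplified inputs. The boolean parameters stand in for the linkage, address-taken, and subtarget checks, and the exact guard around the 4-byte minimum is elided in this listing, so it is represented by a single flag here.

#include <algorithm>
#include <cstdint>

// getFunctionParamOptimizedAlign: bump to at least 16 bytes only when the
// callee has local linkage and its address is never taken; otherwise callers
// may already rely on the plain ABI alignment.
uint64_t optimizedParamAlign(uint64_t ABIAlign, bool HasLocalLinkage,
                             bool AddressTaken) {
  if (!HasLocalLinkage || AddressTaken)
    return ABIAlign;
  return std::max<uint64_t>(16, ABIAlign); // enables 128-bit vector ld/st
}

// getFunctionByValParamAlign: start from the IR alignment, try the optimized
// alignment, and (when the ptxas workaround applies) never drop below 4 bytes.
uint64_t byvalParamAlign(uint64_t InitialAlign, uint64_t ABIAlign,
                         bool HasLocalLinkage, bool AddressTaken,
                         bool ApplyMin4Workaround) {
  uint64_t A = std::max(
      InitialAlign, optimizedParamAlign(ABIAlign, HasLocalLinkage, AddressTaken));
  return ApplyMin4Workaround ? std::max<uint64_t>(A, 4) : A;
}
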
5080
5081// Helper for getting a function parameter name. Name is composed from
5082// its index and the function name. Negative index corresponds to special
5083// parameter (unsized array) used for passing variable arguments.
5085 int Idx) const {
5086 std::string ParamName;
5087 raw_string_ostream ParamStr(ParamName);
5088
5089 ParamStr << getTargetMachine().getSymbol(F)->getName();
5090 if (Idx < 0)
5091 ParamStr << "_vararg";
5092 else
5093 ParamStr << "_param_" << Idx;
5094
5095 return ParamName;
5096}
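
The naming scheme above in one line, as a small sketch using plain std::string instead of raw_string_ostream:

#include <string>

// "<symbol>_param_<idx>" for ordinary parameters, "<symbol>_vararg" for the
// special unsized-array parameter that carries variadic arguments.
std::string exampleParamName(const std::string &FnSymbol, int Idx) {
  return Idx < 0 ? FnSymbol + "_vararg"
                 : FnSymbol + "_param_" + std::to_string(Idx);
}
// e.g. exampleParamName("foo", 2) == "foo_param_2"
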
5097
5098/// isLegalAddressingMode - Return true if the addressing mode represented
5099/// by AM is legal for this target, for a load/store of the specified type.
5100/// Used to guide target specific optimizations, like loop strength reduction
5101/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5102/// (CodeGenPrepare.cpp)
5103bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5104                                                const AddrMode &AM, Type *Ty,
5105 unsigned AS, Instruction *I) const {
5106 // AddrMode - This represents an addressing mode of:
5107 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5108 //
5109 // The legal address modes are
5110 // - [avar]
5111 // - [areg]
5112 // - [areg+immoff]
5113 // - [immAddr]
5114
5115 if (AM.BaseGV) {
5116 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5117 }
5118
5119 switch (AM.Scale) {
5120 case 0: // "r", "r+i" or "i" is allowed
5121 break;
5122 case 1:
5123 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5124 return false;
5125 // Otherwise we have r+i.
5126 break;
5127 default:
5128 // No scale > 1 is allowed
5129 return false;
5130 }
5131 return true;
5132}
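
The same legality rule restated as a stand-alone sketch; ExampleAddrMode is an invented stand-in for the AddrMode fields used above.

struct ExampleAddrMode {
  bool HasBaseGV = false;  // symbolic base (global variable)
  long BaseOffs = 0;       // constant offset
  bool HasBaseReg = false; // base register present
  int Scale = 0;           // factor applied to the scaled index register
};

// Legal NVPTX addresses are [avar], [areg], [areg+immoff] and [immAddr].
bool exampleIsLegalNVPTXAddr(const ExampleAddrMode &AM) {
  if (AM.HasBaseGV) // a symbol is only legal with nothing else attached
    return AM.BaseOffs == 0 && !AM.HasBaseReg && AM.Scale == 0;
  if (AM.Scale == 0) // "r", "r+i" or "i"
    return true;
  if (AM.Scale == 1)           // the scaled register may serve as the base,
    return !AM.HasBaseReg;     // but "r+r" and "r+r+i" are not addressable
  return false; // no scale > 1 is allowed
}
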
5133
5134//===----------------------------------------------------------------------===//
5135// NVPTX Inline Assembly Support
5136//===----------------------------------------------------------------------===//
5137
5138/// getConstraintType - Given a constraint letter, return the type of
5139/// constraint it is for this target.
5140NVPTXTargetLowering::ConstraintType
5141NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5142  if (Constraint.size() == 1) {
5143 switch (Constraint[0]) {
5144 default:
5145 break;
5146 case 'b':
5147 case 'r':
5148 case 'h':
5149 case 'c':
5150 case 'l':
5151 case 'f':
5152 case 'd':
5153 case '0':
5154 case 'N':
5155 return C_RegisterClass;
5156 }
5157 }
5158 return TargetLowering::getConstraintType(Constraint);
5159}
5160
5161std::pair<unsigned, const TargetRegisterClass *>
5162NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5163                                                  StringRef Constraint,
5164 MVT VT) const {
5165 if (Constraint.size() == 1) {
5166 switch (Constraint[0]) {
5167 case 'b':
5168 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5169 case 'c':
5170 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5171 case 'h':
5172 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5173 case 'r':
5174 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5175 case 'l':
5176 case 'N':
5177 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5178 case 'f':
5179 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5180 case 'd':
5181 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5182 }
5183 }
5184 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5185}
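
For context, these constraint letters are the ones used by inline PTX assembly. A typical use from CUDA C++ might look like the sketch below, where "f" binds .f32 registers (Float32RegsRegClass above); this is an illustrative example, not code from this file.

__device__ float fma_via_inline_ptx(float a, float b, float c) {
  float d;
  // "=f"/"f" request 32-bit float registers, matching the 'f' constraint above.
  asm("fma.rn.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
  return d;
}
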
5186
5187//===----------------------------------------------------------------------===//
5188// NVPTX DAG Combining
5189//===----------------------------------------------------------------------===//
5190
5191bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5192                                   CodeGenOptLevel OptLevel) const {
5193 // Always honor command-line argument
5194 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5195 return FMAContractLevelOpt > 0;
5196
5197 // Do not contract if we're not optimizing the code.
5198 if (OptLevel == CodeGenOptLevel::None)
5199 return false;
5200
5201 // Honor TargetOptions flags that explicitly say fusion is okay.
5203 return true;
5204
5205 return allowUnsafeFPMath(MF);
5206}
5207
5208bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5209  // Honor TargetOptions flags that explicitly say unsafe math is okay.
5211 return true;
5212
5213 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5214 const Function &F = MF.getFunction();
5215 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5216}
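
The decision order implemented by allowFMA and allowUnsafeFPMath, condensed into one stand-alone sketch. The exact TargetOptions flags consulted are elided in this listing, so they are represented by a single boolean here.

// 1) an explicit -nvptx-fma-level always wins, 2) no contraction at -O0,
// 3) otherwise fall back to target options and the function's
//    "unsafe-fp-math" attribute.
bool exampleAllowFMA(bool FMALevelOptGiven, int FMALevelOpt, bool Optimizing,
                     bool TargetOptionsAllowFusion, bool FnUnsafeFPMath) {
  if (FMALevelOptGiven)
    return FMALevelOpt > 0;
  if (!Optimizing)
    return false;
  if (TargetOptionsAllowFusion)
    return true;
  return FnUnsafeFPMath;
}
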
5217
5218/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5219/// operands N0 and N1. This is a helper for PerformADDCombine that is
5220/// called with the default operands, and if that fails, with commuted
5221/// operands.
5222static SDValue PerformADDCombineWithOperands(
5223    SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
5224    const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
5225 SelectionDAG &DAG = DCI.DAG;
5226 // Skip non-integer, non-scalar case
5227  EVT VT = N0.getValueType();
5228 if (VT.isVector())
5229 return SDValue();
5230
5231 // fold (add (mul a, b), c) -> (mad a, b, c)
5232 //
5233 if (N0.getOpcode() == ISD::MUL) {
5234 assert (VT.isInteger());
5235 // For integer:
5236 // Since integer multiply-add costs the same as integer multiply
5237 // but is more costly than integer add, do the fusion only when
5238 // the mul is only used in the add.
5239 if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
5240 !N0.getNode()->hasOneUse())
5241 return SDValue();
5242
5243 // Do the folding
5244 return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5245 N0.getOperand(0), N0.getOperand(1), N1);
5246 }
5247 else if (N0.getOpcode() == ISD::FMUL) {
5248 if (VT == MVT::f32 || VT == MVT::f64) {
5249 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5250 &DAG.getTargetLoweringInfo());
5251 if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
5252 return SDValue();
5253
5254 // For floating point:
5255      // Do the fusion only when the mul has fewer than 5 uses and all of
5256      // them are adds.
5257      // The heuristic is that if a use is not an add, that use cannot be
5258      // fused into an fma, so the mul is still needed anyway.
5259      // If there are more than 4 uses, even if they are all adds, fusing
5260      // them will increase register pressure.
5261 //
5262 int numUses = 0;
5263 int nonAddCount = 0;
5264 for (const SDNode *User : N0.getNode()->uses()) {
5265 numUses++;
5266 if (User->getOpcode() != ISD::FADD)
5267 ++nonAddCount;
5268 }
5269 if (numUses >= 5)
5270 return SDValue();
5271 if (nonAddCount) {
5272 int orderNo = N->getIROrder();
5273 int orderNo2 = N0.getNode()->getIROrder();
5274        // Simple heuristic for estimating potential register pressure:
5275        // the IR-order difference is used as a proxy for the distance
5276        // between def and use; the longer the distance, the more likely
5277        // the fusion is to increase register pressure.
5278 if (orderNo - orderNo2 < 500)
5279 return SDValue();
5280
5281 // Now, check if at least one of the FMUL's operands is live beyond the node N,
5282 // which guarantees that the FMA will not increase register pressure at node N.
5283 bool opIsLive = false;
5284 const SDNode *left = N0.getOperand(0).getNode();
5285 const SDNode *right = N0.getOperand(1).getNode();
5286
5287 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5288 opIsLive = true;
5289
5290 if (!opIsLive)
5291 for (const SDNode *User : left->uses()) {
5292 int orderNo3 = User->getIROrder();
5293 if (orderNo3 > orderNo) {
5294 opIsLive = true;
5295 break;
5296 }
5297 }
5298
5299 if (!opIsLive)
5300 for (const SDNode *User : right->uses()) {
5301 int orderNo3 = User->getIROrder();
5302 if (orderNo3 > orderNo) {
5303 opIsLive = true;
5304 break;
5305 }
5306 }
5307
5308 if (!opIsLive)
5309 return SDValue();
5310 }
5311
5312 return DAG.getNode(ISD::FMA, SDLoc(N), VT,
5313 N0.getOperand(0), N0.getOperand(1), N1);
5314 }
5315 }
5316
5317 return SDValue();
5318}
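
The floating-point part of the heuristic above, restated as a stand-alone predicate. The inputs are the counts and IR orders the code computes, and the 500 threshold is taken directly from the code.

// Fuse fmul into fma only if the multiply has at most 4 uses. If some use is
// not an add, additionally require a large def-use distance in IR order (a
// register-pressure proxy) and that at least one multiply operand stays live
// past the add.
bool exampleShouldFuseFMulIntoFMA(int NumUses, int NonAddUses, int AddOrder,
                                  int MulOrder, bool OperandLivesPastAdd) {
  if (NumUses >= 5)
    return false;
  if (NonAddUses == 0)
    return true;
  if (AddOrder - MulOrder < 500) // def and use too close together
    return false;
  return OperandLivesPastAdd;
}
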
5319
5321 // Operands from the 2nd to the last one are the values to be stored
5322 for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
5323 if (!N->getOperand(I).isUndef())
5324 return SDValue();
5325
5326 // Operand 0 is the previous value in the chain. Cannot return EntryToken
5327 // as the previous value will become unused and eliminated later.
5328 return N->getOperand(0);
5329}
5330
5331/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5332///
5333static SDValue PerformADDCombine(SDNode *N,
5334                                 TargetLowering::DAGCombinerInfo &DCI,
5335                                 const NVPTXSubtarget &Subtarget,
5336 CodeGenOptLevel OptLevel) {
5337 SDValue N0 = N->getOperand(0);
5338 SDValue N1 = N->getOperand(1);
5339
5340 // First try with the default operand order.
5341 if (SDValue Result =
5342 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
5343 return Result;
5344
5345 // If that didn't work, try again with the operands commuted.
5346 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
5347}
5348
5351 // The type legalizer turns a vector load of i8 values into a zextload to i16
5352 // registers, optionally ANY_EXTENDs it (if target type is integer),
5353 // and ANDs off the high 8 bits. Since we turn this load into a
5354 // target-specific DAG node, the DAG combiner fails to eliminate these AND
5355 // nodes. Do that here.
5356 SDValue Val = N->getOperand(0);
5357 SDValue Mask = N->getOperand(1);
5358
5359 if (isa<ConstantSDNode>(Val)) {
5360 std::swap(Val, Mask);
5361 }
5362
5363 SDValue AExt;
5364
5365 // Convert BFE-> truncate i16 -> and 255
5366 // To just BFE-> truncate i16, as the value already has all the bits in the
5367 // right places.
5368 if (Val.getOpcode() == ISD::TRUNCATE) {
5369 SDValue BFE = Val.getOperand(0);
5370 if (BFE.getOpcode() != NVPTXISD::BFE)
5371 return SDValue();
5372
5373 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5374 if (!BFEBits)
5375 return SDValue();
5376 uint64_t BFEBitsVal = BFEBits->getZExtValue();
5377
5378 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5379 if (!MaskCnst) {
5380 // Not an AND with a constant
5381 return SDValue();
5382 }
5383 uint64_t MaskVal = MaskCnst->getZExtValue();
5384
5385 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5386 return SDValue();
5387 // If we get here, the AND is unnecessary. Just replace it with the trunc
5388 DCI.CombineTo(N, Val, false);
5389 }
5390 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5391 if (Val.getOpcode() == ISD::ANY_EXTEND) {
5392 AExt = Val;
5393 Val = Val->getOperand(0);
5394 }
5395
5396 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5397 Val = Val->getOperand(0);
5398 }
5399
5400 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5401 Val->getOpcode() == NVPTXISD::LoadV4) {
5402 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5403 if (!MaskCnst) {
5404 // Not an AND with a constant
5405 return SDValue();
5406 }
5407
5408 uint64_t MaskVal = MaskCnst->getZExtValue();
5409 if (MaskVal != 0xff) {
5410 // Not an AND that chops off top 8 bits
5411 return SDValue();
5412 }
5413
5414 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5415 if (!Mem) {
5416 // Not a MemSDNode?!?
5417 return SDValue();
5418 }
5419
5420 EVT MemVT = Mem->getMemoryVT();
5421 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5422 // We only handle the i8 case
5423 return SDValue();
5424 }
5425
5426 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5427 if (ExtType == ISD::SEXTLOAD) {
5428 // If for some reason the load is a sextload, the and is needed to zero
5429 // out the high 8 bits
5430 return SDValue();
5431 }
5432
5433 bool AddTo = false;
5434 if (AExt.getNode() != nullptr) {
5435 // Re-insert the ext as a zext.
5436 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5437 AExt.getValueType(), Val);
5438 AddTo = true;
5439 }
5440
5441 // If we get here, the AND is unnecessary. Just replace it with the load
5442 DCI.CombineTo(N, Val, AddTo);
5443 }
5444
5445 return SDValue();
5446}
5447
5448static SDValue PerformREMCombine(SDNode *N,
5449                                 TargetLowering::DAGCombinerInfo &DCI,
5450                                 CodeGenOptLevel OptLevel) {
5451 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5452
5453 // Don't do anything at less than -O2.
5454 if (OptLevel < CodeGenOptLevel::Default)
5455 return SDValue();
5456
5457 SelectionDAG &DAG = DCI.DAG;
5458 SDLoc DL(N);
5459 EVT VT = N->getValueType(0);
5460 bool IsSigned = N->getOpcode() == ISD::SREM;
5461 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5462
5463 const SDValue &Num = N->getOperand(0);
5464 const SDValue &Den = N->getOperand(1);
5465
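  // For illustration: if another user already computes (div Num, Den) with the
  // matching signedness, the remainder is rewritten below as
  //   Num - (Num / Den) * Den
  // so only one division is emitted for the pair.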
5466 for (const SDNode *U : Num->uses()) {
5467 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5468 U->getOperand(1) == Den) {
5469 // Num % Den -> Num - (Num / Den) * Den
5470 return DAG.getNode(ISD::SUB, DL, VT, Num,
5471 DAG.getNode(ISD::MUL, DL, VT,
5472 DAG.getNode(DivOpc, DL, VT, Num, Den),
5473 Den));
5474 }
5475 }
5476 return SDValue();
5477}
5478
5479enum OperandSignedness {
5480  Signed,
5481  Unsigned,
5482  Unknown
5483};
5484
5485/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5486/// that can be demoted to \p OptSize bits without loss of information. The
5487/// signedness of the operand, if determinable, is placed in \p S.
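// For illustration: (sign_extend i16 -> i32) is demotable to 16 bits with
// S == Signed, and (zero_extend i16 -> i32) with S == Unsigned; any other
// operand leaves S == Unknown and is rejected.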
5488static bool IsMulWideOperandDemotable(SDValue Op,
5489                                      unsigned OptSize,
5490 OperandSignedness &S) {
5491 S = Unknown;
5492
5493 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5494 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5495 EVT OrigVT = Op.getOperand(0).getValueType();
5496 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5497 S = Signed;
5498 return true;
5499 }
5500 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5501 EVT OrigVT = Op.getOperand(0).getValueType();
5502 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5503 S = Unsigned;
5504 return true;
5505 }
5506 }
5507
5508 return false;
5509}
5510
5511/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5512/// be demoted to \p OptSize bits without loss of information. If the operands
5513/// contain a constant, it should appear as the RHS operand. The signedness of
5514/// the operands is placed in \p IsSigned.
5515static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5516                                        unsigned OptSize,
5517 bool &IsSigned) {
5518 OperandSignedness LHSSign;
5519
5520 // The LHS operand must be a demotable op
5521 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5522 return false;
5523
5524 // We should have been able to determine the signedness from the LHS
5525 if (LHSSign == Unknown)
5526 return false;
5527
5528 IsSigned = (LHSSign == Signed);
5529
5530 // The RHS can be a demotable op or a constant
5531 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5532 const APInt &Val = CI->getAPIntValue();
5533 if (LHSSign == Unsigned) {
5534 return Val.isIntN(OptSize);
5535 } else {
5536 return Val.isSignedIntN(OptSize);
5537 }
5538 } else {
5539 OperandSignedness RHSSign;
5540 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5541 return false;
5542
5543 return LHSSign == RHSSign;
5544 }
5545}
5546
5547/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5548/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5549/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5550/// amount.
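// For illustration, with a 32-bit result (OptSize == 16):
//   (mul (sext i16 %a), (sext i16 %b))  --> MUL_WIDE_SIGNED   (mul.wide.s16)
//   (shl (zext i16 %a), 12)             --> MUL_WIDE_UNSIGNED by (1 << 12)
// The operands are truncated to i16 first; those truncates fold away later.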
5551static SDValue TryMULWIDECombine(SDNode *N,
5552                                 TargetLowering::DAGCombinerInfo &DCI) {
5553  EVT MulType = N->getValueType(0);
5554 if (MulType != MVT::i32 && MulType != MVT::i64) {
5555 return SDValue();
5556 }
5557
5558 SDLoc DL(N);
5559 unsigned OptSize = MulType.getSizeInBits() >> 1;
5560 SDValue LHS = N->getOperand(0);
5561 SDValue RHS = N->getOperand(1);
5562
5563 // Canonicalize the multiply so the constant (if any) is on the right
5564 if (N->getOpcode() == ISD::MUL) {
5565 if (isa<ConstantSDNode>(LHS)) {
5566 std::swap(LHS, RHS);
5567 }
5568 }
5569
5570 // If we have a SHL, determine the actual multiply amount
5571 if (N->getOpcode() == ISD::SHL) {
5572 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5573 if (!ShlRHS) {
5574 return SDValue();
5575 }
5576
5577 APInt ShiftAmt = ShlRHS->getAPIntValue();
5578 unsigned BitWidth = MulType.getSizeInBits();
5579 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5580 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5581 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5582 } else {
5583 return SDValue();
5584 }
5585 }
5586
5587 bool Signed;
5588 // Verify that our operands are demotable
5589 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5590 return SDValue();
5591 }
5592
5593 EVT DemotedVT;
5594 if (MulType == MVT::i32) {
5595 DemotedVT = MVT::i16;
5596 } else {
5597 DemotedVT = MVT::i32;
5598 }
5599
5600 // Truncate the operands to the correct size. Note that these are just for
5601 // type consistency and will (likely) be eliminated in later phases.
5602 SDValue TruncLHS =
5603 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5604 SDValue TruncRHS =
5605 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5606
5607 unsigned Opc;
5608  if (Signed) {
5609    Opc = NVPTXISD::MUL_WIDE_SIGNED;
5610  } else {
5611    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5612  }
5613
5614 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5615}
5616
5617/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5618static SDValue PerformMULCombine(SDNode *N,
5619                                 TargetLowering::DAGCombinerInfo &DCI,
5620                                 CodeGenOptLevel OptLevel) {
5621 if (OptLevel > CodeGenOptLevel::None) {
5622 // Try mul.wide combining at OptLevel > 0
5623 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5624 return Ret;
5625 }
5626
5627 return SDValue();
5628}
5629
5630/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5631static SDValue PerformSHLCombine(SDNode *N,
5632                                 TargetLowering::DAGCombinerInfo &DCI,
5633                                 CodeGenOptLevel OptLevel) {
5634 if (OptLevel > CodeGenOptLevel::None) {
5635 // Try mul.wide combining at OptLevel > 0
5636 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5637 return Ret;
5638 }
5639
5640 return SDValue();
5641}
5642
5643static SDValue PerformSETCCCombine(SDNode *N,
5644                                 TargetLowering::DAGCombinerInfo &DCI,
5645                                 unsigned int SmVersion) {
5646 EVT CCType = N->getValueType(0);
5647 SDValue A = N->getOperand(0);
5648 SDValue B = N->getOperand(1);
5649
5650 EVT AType = A.getValueType();
5651 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5652 return SDValue();
5653
5654 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5655 return SDValue();
5656
5657 SDLoc DL(N);
5658 // setp.f16x2 returns two scalar predicates, which we need to
5659 // convert back to v2i1. The returned result will be scalarized by
5660 // the legalizer, but the comparison will remain a single vector
5661 // instruction.
5662 SDValue CCNode = DCI.DAG.getNode(
5663      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5664                                     : NVPTXISD::SETP_BF16X2,
5665 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5666 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5667 CCNode.getValue(1));
5668}
5669
5670static SDValue PerformEXTRACTCombine(SDNode *N,
5671                                     TargetLowering::DAGCombinerInfo &DCI) {
5672  SDValue Vector = N->getOperand(0);
5673 SDLoc DL(N);
5674 EVT VectorVT = Vector.getValueType();
5675 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5676 IsPTXVectorType(VectorVT.getSimpleVT()))
5677 return SDValue(); // Native vector loads already combine nicely w/
5678 // extract_vector_elt.
5679  // Don't mess with singletons or with v2*16, v4i8 and v8i8 types; we already
5680  // handle them OK.
5681 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5682 VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5683 return SDValue();
5684
5685 uint64_t VectorBits = VectorVT.getSizeInBits();
5686 // We only handle the types we can extract in-register.
5687 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5688 return SDValue();
5689
5690 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5691 // Index == 0 is handled by generic DAG combiner.
5692 if (!Index || Index->getZExtValue() == 0)
5693 return SDValue();
5694
5695 MVT IVT = MVT::getIntegerVT(VectorBits);
5696 EVT EltVT = VectorVT.getVectorElementType();
5697 EVT EltIVT = EltVT.changeTypeToInteger();
5698 uint64_t EltBits = EltVT.getScalarSizeInBits();
5699
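  // For illustration: extracting element 1 of a <2 x float> value V becomes
  //   (bitcast f32 (trunc i32 (sra i64 (bitcast i64 V), 32)))
  // i.e. shift right by Index * EltBits, truncate to an element-sized integer,
  // and bitcast back if the element type is not an integer.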
5700 SDValue Result = DCI.DAG.getNode(
5701 ISD::TRUNCATE, DL, EltIVT,
5702 DCI.DAG.getNode(
5703 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5704 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5705
5706 // If element has non-integer type, bitcast it back to the expected type.
5707 if (EltVT != EltIVT)
5708 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5709  // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5710 if (EltVT != N->getValueType(0))
5711 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5712
5713 return Result;
5714}
5715
5716static SDValue PerformVSELECTCombine(SDNode *N,
5717                                     TargetLowering::DAGCombinerInfo &DCI) {
5718  SDValue VA = N->getOperand(1);
5719 EVT VectorVT = VA.getValueType();
5720 if (VectorVT != MVT::v4i8)
5721 return SDValue();
5722
5723  // We need to split vselect into individual per-element operations. Because
5724  // we use BFE/BFI instructions for byte extraction/insertion, we end up with
5725  // 32-bit values anyway, so we may as well do the comparison as i32 to avoid
5726  // the conversions to/from i16 normally used for i8 values.
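  // For illustration: (vselect <4 x i1> %c, <4 x i8> %a, <4 x i8> %b) becomes
  // four (select i1, i32, i32) nodes over the any-extended lanes, each result
  // truncated back to i8 and reassembled into a v4i8 with BUILD_VECTOR.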
5727  SmallVector<SDValue, 4> E;
5728  SDLoc DL(N);
5729 SDValue VCond = N->getOperand(0);
5730 SDValue VB = N->getOperand(2);
5731 for (int I = 0; I < 4; ++I) {
5732 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5733 DCI.DAG.getConstant(I, DL, MVT::i32));
5734 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5735 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5736 DCI.DAG.getConstant(I, DL, MVT::i32)),
5737 DL, MVT::i32);
5738 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5739 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5740 DCI.DAG.getConstant(I, DL, MVT::i32)),
5741 DL, MVT::i32);
5742    E.push_back(DCI.DAG.getAnyExtOrTrunc(
5743        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5744 }
5745 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5746}
5747
5748static SDValue PerformLOADCombine(SDNode *N,
5749                                  TargetLowering::DAGCombinerInfo &DCI) {
5750  SelectionDAG &DAG = DCI.DAG;
5751 LoadSDNode *LD = cast<LoadSDNode>(N);
5752
5753 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5754 // letting ReplaceLoadVector split it into smaller loads during legalization.
5755 // This is done at dag-combine1 time, so that vector operations with i8
5756 // elements can be optimised away instead of being needlessly split during
5757 // legalization, which involves storing to the stack and loading it back.
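  // For illustration: (load <16 x i8>) becomes a single NVPTXISD::LoadV4 with
  // four i32 results (effectively <4 x v4i8>), which are gathered into a v4i32
  // build_vector and bitcast back to v16i8, with the load chain returned
  // alongside via merge_values.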
5758 EVT VT = N->getValueType(0);
5759 if (VT != MVT::v16i8)
5760 return SDValue();
5761
5762 SDLoc DL(N);
5763
5764 // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5765 unsigned Opc = NVPTXISD::LoadV4;
5766 EVT NewVT = MVT::v4i32;
5767 EVT EltVT = NewVT.getVectorElementType();
5768 unsigned NumElts = NewVT.getVectorNumElements();
5769 EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5770 SDVTList RetVTList = DAG.getVTList(RetVTs);
5771 SmallVector<SDValue, 8> Ops(N->ops());
5772 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5773 SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5774 LD->getMemOperand());
5775 SDValue NewChain = NewLoad.getValue(NumElts);
5776
5777 // Create a vector of the same type returned by the original load.
5778  SmallVector<SDValue, 4> Elts;
5779  for (unsigned i = 0; i < NumElts; i++)
5780 Elts.push_back(NewLoad.getValue(i));
5781 return DCI.DAG.getMergeValues(
5782 {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
5783 NewChain},
5784 DL);
5785}
5786
5787SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5788 DAGCombinerInfo &DCI) const {
5789  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5790  switch (N->getOpcode()) {
5791 default: break;
5792 case ISD::ADD:
5793 case ISD::FADD:
5794 return PerformADDCombine(N, DCI, STI, OptLevel);
5795 case ISD::MUL:
5796 return PerformMULCombine(N, DCI, OptLevel);
5797 case ISD::SHL:
5798 return PerformSHLCombine(N, DCI, OptLevel);
5799 case ISD::AND:
5800 return PerformANDCombine(N, DCI);
5801 case ISD::UREM:
5802 case ISD::SREM:
5803 return PerformREMCombine(N, DCI, OptLevel);
5804 case ISD::SETCC:
5805 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5806 case ISD::LOAD:
5807 return PerformLOADCombine(N, DCI);
5808    case NVPTXISD::StoreRetval:
5809    case NVPTXISD::StoreRetvalV2:
5810    case NVPTXISD::StoreRetvalV4:
5811      return PerformStoreRetvalCombine(N);
5812    case ISD::EXTRACT_VECTOR_ELT:
5813      return PerformEXTRACTCombine(N, DCI);
5814 case ISD::VSELECT:
5815 return PerformVSELECTCombine(N, DCI);
5816 }
5817 return SDValue();
5818}
5819
5820/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
5821static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5822                              SmallVectorImpl<SDValue> &Results) {
5823  EVT ResVT = N->getValueType(0);
5824 SDLoc DL(N);
5825
5826 assert(ResVT.isVector() && "Vector load must have vector type");
5827
5828 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
5829 // legal. We can (and should) split that into 2 loads of <2 x double> here
5830 // but I'm leaving that as a TODO for now.
5831 assert(ResVT.isSimple() && "Can only handle simple types");
5832 switch (ResVT.getSimpleVT().SimpleTy) {
5833 default:
5834 return;
5835 case MVT::v2i8:
5836 case MVT::v2i16:
5837 case MVT::v2i32:
5838 case MVT::v2i64:
5839 case MVT::v2f16:
5840 case MVT::v2f32:
5841 case MVT::v2f64:
5842 case MVT::v4i8:
5843 case MVT::v4i16:
5844 case MVT::v4i32:
5845 case MVT::v4f16:
5846 case MVT::v4f32:
5847 case MVT::v8f16: // <4 x f16x2>
5848 case MVT::v8bf16: // <4 x bf16x2>
5849 case MVT::v8i16: // <4 x i16x2>
5850 // This is a "native" vector type
5851 break;
5852 }
5853
5854 LoadSDNode *LD = cast<LoadSDNode>(N);
5855
5856 Align Alignment = LD->getAlign();
5857 auto &TD = DAG.getDataLayout();
5858 Align PrefAlign =
5859 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5860 if (Alignment < PrefAlign) {
5861 // This load is not sufficiently aligned, so bail out and let this vector
5862 // load be scalarized. Note that we may still be able to emit smaller
5863 // vector loads. For example, if we are loading a <4 x float> with an
5864 // alignment of 8, this check will fail but the legalizer will try again
5865 // with 2 x <2 x float>, which will succeed with an alignment of 8.
5866 return;
5867 }
5868
5869 EVT EltVT = ResVT.getVectorElementType();
5870 unsigned NumElts = ResVT.getVectorNumElements();
5871
5872 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5873 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
5874 // loaded type to i16 and propagate the "real" type as the memory type.
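  // For illustration: a <2 x i8> load is emitted as LoadV2 producing two i16
  // values with memory VT v2i8; each value is truncated back to i8 below.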
5875 bool NeedTrunc = false;
5876 if (EltVT.getSizeInBits() < 16) {
5877 EltVT = MVT::i16;
5878 NeedTrunc = true;
5879 }
5880
5881 unsigned Opcode = 0;
5882 SDVTList LdResVTs;
5883 bool Load16x2 = false;
5884
5885 switch (NumElts) {
5886 default:
5887 return;
5888 case 2:
5889 Opcode = NVPTXISD::LoadV2;
5890 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5891 break;
5892 case 4: {
5893 Opcode = NVPTXISD::LoadV4;
5894 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5895 LdResVTs = DAG.getVTList(ListVTs);
5896 break;
5897 }
5898 case 8: {
5899    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
5900    // instruction. Instead, we split the vector into v2f16 chunks and
5901 // load them with ld.v4.b32.
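    // For illustration: an <8 x half> load becomes one LoadV4 whose four
    // results are v2f16 chunks; each chunk is split with EXTRACT_VECTOR_ELT
    // into two scalars so the original eight-element build_vector can be formed.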
5902 assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
5903 Load16x2 = true;
5904 Opcode = NVPTXISD::LoadV4;
5905 EVT VVT;
5906 switch (EltVT.getSimpleVT().SimpleTy) {
5907 case MVT::f16:
5908 VVT = MVT::v2f16;
5909 break;
5910 case MVT::bf16:
5911 VVT = MVT::v2bf16;
5912 break;
5913 case MVT::i16:
5914 VVT = MVT::v2i16;
5915 break;
5916 default:
5917 llvm_unreachable("Unsupported v8 vector type.");
5918 }
5919 EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
5920 LdResVTs = DAG.getVTList(ListVTs);
5921 break;
5922 }
5923 }
5924
5925 // Copy regular operands
5926 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
5927
5928 // The select routine does not have access to the LoadSDNode instance, so
5929 // pass along the extension information
5930 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5931
5932 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5933 LD->getMemoryVT(),
5934 LD->getMemOperand());
5935
5936 SmallVector<SDValue, 8> ScalarRes;
5937 if (Load16x2) {
5938 // Split v2f16 subvectors back into individual elements.
5939 NumElts /= 2;
5940 for (unsigned i = 0; i < NumElts; ++i) {
5941 SDValue SubVector = NewLD.getValue(i);
5942 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5943 DAG.getIntPtrConstant(0, DL));
5944 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5945 DAG.getIntPtrConstant(1, DL));
5946 ScalarRes.push_back(E0);
5947 ScalarRes.push_back(E1);
5948 }
5949 } else {
5950 for (unsigned i = 0; i < NumElts; ++i) {
5951 SDValue Res = NewLD.getValue(i);
5952 if (NeedTrunc)
5953 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5954 ScalarRes.push_back(Res);
5955 }
5956 }
5957
5958 SDValue LoadChain = NewLD.getValue(NumElts);
5959
5960 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5961
5962 Results.push_back(BuildVec);
5963 Results.push_back(LoadChain);
5964}
5965
5966static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5967                                     SmallVectorImpl<SDValue> &Results) {
5968  SDValue Chain = N->getOperand(0);
5969 SDValue Intrin = N->getOperand(1);
5970 SDLoc DL(N);
5971
5972 // Get the intrinsic ID
5973 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5974 switch (IntrinNo) {
5975 default:
5976 return;
5977 case Intrinsic::nvvm_ldg_global_i:
5978 case Intrinsic::nvvm_ldg_global_f:
5979 case Intrinsic::nvvm_ldg_global_p:
5980 case Intrinsic::nvvm_ldu_global_i:
5981 case Intrinsic::nvvm_ldu_global_f:
5982 case Intrinsic::nvvm_ldu_global_p: {
5983 EVT ResVT = N->getValueType(0);
5984
5985 if (ResVT.isVector()) {
5986 // Vector LDG/LDU
5987
5988 unsigned NumElts = ResVT.getVectorNumElements();
5989 EVT EltVT = ResVT.getVectorElementType();
5990
5991 // Since LDU/LDG are target nodes, we cannot rely on DAG type
5992 // legalization.
5993 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
5994 // loaded type to i16 and propagate the "real" type as the memory type.
5995 bool NeedTrunc = false;
5996 if (EltVT.getSizeInBits() < 16) {
5997 EltVT = MVT::i16;
5998 NeedTrunc = true;
5999 }
6000
6001 unsigned Opcode = 0;
6002 SDVTList LdResVTs;
6003
6004 switch (NumElts) {
6005 default:
6006 return;
6007 case 2:
6008 switch (IntrinNo) {
6009 default:
6010 return;
6011 case Intrinsic::nvvm_ldg_global_i:
6012 case Intrinsic::nvvm_ldg_global_f:
6013 case Intrinsic::nvvm_ldg_global_p:
6014 Opcode = NVPTXISD::LDGV2;
6015 break;
6016 case Intrinsic::nvvm_ldu_global_i:
6017 case Intrinsic::nvvm_ldu_global_f:
6018 case Intrinsic::nvvm_ldu_global_p:
6019 Opcode = NVPTXISD::LDUV2;
6020 break;
6021 }
6022 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6023 break;
6024 case 4: {
6025 switch (IntrinNo) {
6026 default:
6027 return;
6028 case Intrinsic::nvvm_ldg_global_i:
6029 case Intrinsic::nvvm_ldg_global_f:
6030 case Intrinsic::nvvm_ldg_global_p:
6031 Opcode = NVPTXISD::LDGV4;
6032 break;
6033 case Intrinsic::nvvm_ldu_global_i:
6034 case Intrinsic::nvvm_ldu_global_f:
6035 case Intrinsic::nvvm_ldu_global_p:
6036 Opcode = NVPTXISD::LDUV4;
6037 break;
6038 }
6039 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6040 LdResVTs = DAG.getVTList(ListVTs);
6041 break;
6042 }
6043 }
6044
6045 SmallVector<SDValue, 8> OtherOps;
6046
6047 // Copy regular operands
6048
6049 OtherOps.push_back(Chain); // Chain
6050 // Skip operand 1 (intrinsic ID)
6051 // Others
6052 OtherOps.append(N->op_begin() + 2, N->op_end());
6053
6054 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6055
6056 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6057 MemSD->getMemoryVT(),
6058 MemSD->getMemOperand());
6059
6060 SmallVector<SDValue, 4> ScalarRes;
6061
6062 for (unsigned i = 0; i < NumElts; ++i) {
6063 SDValue Res = NewLD.getValue(i);
6064 if (NeedTrunc)
6065 Res =
6066 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6067 ScalarRes.push_back(Res);
6068 }
6069
6070 SDValue LoadChain = NewLD.getValue(NumElts);
6071
6072 SDValue BuildVec =
6073 DAG.getBuildVector(ResVT, DL, ScalarRes);
6074
6075 Results.push_back(BuildVec);
6076 Results.push_back(LoadChain);
6077 } else {
6078 // i8 LDG/LDU
6079 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6080 "Custom handling of non-i8 ldu/ldg?");
6081
6082 // Just copy all operands as-is
6083 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
6084
6085 // Force output to i16
6086 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6087
6088 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6089
6090 // We make sure the memory type is i8, which will be used during isel
6091 // to select the proper instruction.
6092 SDValue NewLD =
6093 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6094 MVT::i8, MemSD->getMemOperand());
6095
6096 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6097 NewLD.getValue(0)));
6098 Results.push_back(NewLD.getValue(1));
6099 }
6100 }
6101 }
6102}
6103
6104void NVPTXTargetLowering::ReplaceNodeResults(
6105    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6106  switch (N->getOpcode()) {
6107  default:
6108    report_fatal_error("Unhandled custom legalization");
6109  case ISD::LOAD:
6110    ReplaceLoadVector(N, DAG, Results);
6111    return;
6112  case ISD::INTRINSIC_W_CHAIN:
6113    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6114    return;
6115 }
6116}
6117
6118NVPTXTargetLowering::AtomicExpansionKind
6119NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6120  Type *Ty = AI->getValOperand()->getType();
6121
6122  if (AI->isFloatingPointOperation()) {
6123    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6124      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6125          STI.getPTXVersion() >= 63)
6126        return AtomicExpansionKind::None;
6127      if (Ty->isFloatTy())
6128        return AtomicExpansionKind::None;
6129      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6130        return AtomicExpansionKind::None;
6131    }
6132    return AtomicExpansionKind::CmpXChg;
6133  }
6134
6135  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6136  auto ITy = cast<llvm::IntegerType>(Ty);
6137
6138  switch (AI->getOperation()) {
6139  default:
6140    return AtomicExpansionKind::CmpXChg;
6141  case AtomicRMWInst::BinOp::And:
6142  case AtomicRMWInst::BinOp::Or:
6143  case AtomicRMWInst::BinOp::Xor:
6144  case AtomicRMWInst::BinOp::Xchg:
6145    switch (ITy->getBitWidth()) {
6146    case 8:
6147    case 16:
6148      return AtomicExpansionKind::CmpXChg;
6149    case 32:
6150      return AtomicExpansionKind::None;
6151    case 64:
6152      if (STI.hasAtomBitwise64())
6153        return AtomicExpansionKind::None;
6154      return AtomicExpansionKind::CmpXChg;
6155    default:
6156      llvm_unreachable("unsupported width encountered");
6157    }
6158  case AtomicRMWInst::BinOp::Add:
6159  case AtomicRMWInst::BinOp::Sub:
6160  case AtomicRMWInst::BinOp::Max:
6161  case AtomicRMWInst::BinOp::Min:
6162  case AtomicRMWInst::BinOp::UMax:
6163  case AtomicRMWInst::BinOp::UMin:
6164    switch (ITy->getBitWidth()) {
6165    case 8:
6166    case 16:
6167      return AtomicExpansionKind::CmpXChg;
6168    case 32:
6169      return AtomicExpansionKind::None;
6170    case 64:
6171      if (STI.hasAtomMinMax64())
6172        return AtomicExpansionKind::None;
6173      return AtomicExpansionKind::CmpXChg;
6174    default:
6175      llvm_unreachable("unsupported width encountered");
6176    }
6177  }
6178
6179  return AtomicExpansionKind::CmpXChg;
6180}
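// For illustration (behavior of the generic AtomicExpand pass, not code in
// this file): AtomicExpansionKind::None keeps the atomicrmw intact so it can
// select to a native atom.* instruction, while AtomicExpansionKind::CmpXChg
// asks AtomicExpand to rewrite the operation as a compare-exchange loop.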
6181
6182// Pin NVPTXTargetObjectFile's vtables to this file.
6183NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6184
6185MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6186    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6187 return getDataSection();
6188}
#define MAKE_CASE(V)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
amdgpu AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
This file contains the declarations of entities that describe floating point environment and related ...
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
Module.h This file contains the declarations for the Module class.
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic)
static bool Is16bitsType(MVT VT)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static unsigned getOpcForTextureInstr(unsigned Intrinsic)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
@ PVF_FIRST
@ PVF_SCALAR
@ PVF_INNER
@ PVF_LAST
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
unsigned SmVersion
Definition: NVVMReflect.cpp:81
#define P(N)
if(VerifyEach)
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:76
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:413
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1108
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:410
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
Value * getValOperand()
Definition: Instructions.h:914
MaybeAlign getParamAlignment(unsigned ArgNo) const
Return the alignment for the specified function parameter.
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition: Attributes.h:783
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1461
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1709
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:587
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:205
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:84
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:39
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
Machine Value Type.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getPTXVersion() const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool allowFP16Math() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
iterator_range< use_iterator > uses()
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node type undefined.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
Class to represent struct types.
Definition: DerivedTypes.h:216
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
@ VoidTyID
type with no size
Definition: Type.h:63
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
Definition: StringSaver.h:52
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1155
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:269
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1031
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1188
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1077
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1052
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1056
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1151
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
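As a quick illustration of the high-half multiply these nodes compute, the plain C++ sketch below (not taken from the LLVM sources; the helper name is mine) widens two 32-bit operands and keeps the upper 32 bits of the product.

#include <cassert>
#include <cstdint>

// Scalar equivalent of MULHU on i32: the top half of the full 64-bit product.
static uint32_t mulhu32(uint32_t A, uint32_t B) {
  return static_cast<uint32_t>((static_cast<uint64_t>(A) * B) >> 32);
}

int main() {
  assert(mulhu32(0xFFFFFFFFu, 2u) == 1u); // 0x1FFFFFFFE >> 32 == 1
  return 0;
}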
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:359
@ SMULO
Overflow-aware nodes for multiplication, with the same form as [SU]ADDO.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1041
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
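The SHL/SRA pair is easy to see on a concrete width. The standalone C++ sketch below (illustrative only, not from this file; assumes two's-complement arithmetic) sign-extends the low 8 bits of a 32-bit value, which is what SIGN_EXTEND_INREG with an i8 source type models.

#include <cassert>
#include <cstdint>

// Shift the low 8 bits to the top, then arithmetic-shift back down so the
// i8 sign bit fills the upper 24 bits.
static int32_t signExtendInReg8(int32_t X) {
  return static_cast<int32_t>(static_cast<uint32_t>(X) << 24) >> 24;
}

int main() {
  assert(signExtendInReg8(0x000000FF) == -1);  // low byte 0xFF -> -1
  assert(signExtendInReg8(0x0000007F) == 127); // low byte 0x7F stays positive
  return 0;
}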
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:922
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:984
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1182
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1208
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:279
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1146
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
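A minimal usage sketch: the allPositive helper and its SmallVector argument are hypothetical; only llvm::all_of itself comes from the entry above.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Equivalent to std::all_of(Vals.begin(), Vals.end(), ...), but the range
// form avoids spelling out the iterators.
bool allPositive(const llvm::SmallVectorImpl<int> &Vals) {
  return llvm::all_of(Vals, [](int V) { return V > 0; });
}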
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
bool Isv2x16VT(EVT VT)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2415
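A small sketch of how the enumerated range is typically consumed; the printIndexed helper is hypothetical, only llvm::enumerate is from the entry above.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

void printIndexed(const llvm::SmallVectorImpl<int> &Vals) {
  // Each element of the enumerated range carries its index and the value.
  for (const auto &En : llvm::enumerate(Vals))
    llvm::outs() << En.index() << ": " << En.value() << "\n";
}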
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
@ ADDRESS_SPACE_PARAM
Definition: NVPTXBaseInfo.h:29
bool getAlign(const Function &F, unsigned index, unsigned &align)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
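A few hand-checkable values illustrating the rounding behaviour (the program below is a sketch, not part of this file):

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::PowerOf2Ceil(1) == 1);
  assert(llvm::PowerOf2Ceil(17) == 32); // rounds up to the next power of two
  assert(llvm::PowerOf2Ceil(64) == 64); // already a power of two
  return 0;
}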
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1937
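A short sketch, assuming a hypothetical doubled helper; only llvm::transform itself comes from the entry above.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

llvm::SmallVector<int, 8> doubled(const llvm::SmallVectorImpl<int> &In) {
  llvm::SmallVector<int, 8> Out(In.size());
  // Applies the lambda to every element of In, writing results through the
  // output iterator, like std::transform but without explicit begin/end.
  llvm::transform(In, Out.begin(), [](int V) { return V * 2; });
  return Out;
}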
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A needed to store Size bytes.
Definition: Alignment.h:155
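A small worked example with assumed values (not from this file):

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  assert(llvm::alignTo(13, llvm::Align(8)) == 16); // round 13 up to an 8-byte multiple
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already aligned
  return 0;
}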
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
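A hedged sketch of the typical call pattern: countScalarPieces is a hypothetical helper, and TLI, DL, and Ty are assumed to be supplied by the surrounding lowering code.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

static unsigned countScalarPieces(const llvm::TargetLowering &TLI,
                                  const llvm::DataLayout &DL, llvm::Type *Ty) {
  llvm::SmallVector<llvm::EVT, 8> ValueVTs;
  llvm::SmallVector<llvm::TypeSize, 8> Offsets;
  // For e.g. {i32, [2 x float]} this yields {i32, f32, f32} plus byte offsets.
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
  return ValueVTs.size();
}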
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
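A few hand-checkable values showing the alignment known to hold at a byte offset from a pointer of alignment A (the program is illustrative, not from this file):

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);
  assert(llvm::commonAlignment(A, 0) == llvm::Align(16));  // offset 0 keeps A
  assert(llvm::commonAlignment(A, 4) == llvm::Align(4));   // limited by the offset
  assert(llvm::commonAlignment(A, 32) == llvm::Align(16)); // offset is a multiple of A
  return 0;
}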
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
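A hedged sketch of building a two-element vector EVT, in the spirit of the v2f16/v2i16 pair types this lowering deals with; makePairVT and Ctx are illustrative assumptions, not names from this file.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

static llvm::EVT makePairVT(llvm::LLVMContext &Ctx, llvm::EVT EltVT) {
  // For a simple element type such as MVT::f16 this yields MVT::v2f16.
  return llvm::EVT::getVectorVT(Ctx, EltVT, /*NumElements=*/2);
}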
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)