//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <optional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

76 "nvptx-sched4reg",
77 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
78
80 "nvptx-fma-level", cl::Hidden,
81 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
82 " 1: do it 2: do it aggressively"),
83 cl::init(2));
84
86 "nvptx-prec-divf32", cl::Hidden,
87 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
88 " IEEE Compliant F32 div.rnd if available."),
89 cl::init(2));
90
92 "nvptx-prec-sqrtf32", cl::Hidden,
93 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
94 cl::init(true));
95
97 "nvptx-force-min-byval-param-align", cl::Hidden,
98 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
99 " params of device functions."),
100 cl::init(false));
101
int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

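// For example, with the defaults above an f32 fdiv lowers to the IEEE
// compliant div.rn.f32 and an f32 fsqrt to sqrt.rn.f32; compiling with
// unsafe FP math (or passing -nvptx-prec-divf32=0 / -nvptx-prec-sqrtf32=0)
// selects the faster approximate forms div.approx.f32 / sqrt.approx.f32
// instead.
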
static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v8i16: // <4 x i16x2>
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

static bool Is16bitsType(MVT VT) {
  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
          VT.SimpleTy == MVT::i16);
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for the packed 2x16
    // types (v2f16/v2bf16/v2i16), which we pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16/v2bf16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
        switch (EltVT.getSimpleVT().SimpleTy) {
        case MVT::f16:
          EltVT = MVT::v2f16;
          break;
        case MVT::bf16:
          EltVT = MVT::v2bf16;
          break;
        case MVT::i16:
          EltVT = MVT::v2i16;
          break;
        default:
          llvm_unreachable("Unexpected type");
        }
        NumElts /= 2;
      } else if (EltVT.getSimpleVT() == MVT::i8 &&
                 (NumElts % 4 == 0 || NumElts == 3)) {
        // v*i8 are formally lowered as v4i8
        EltVT = MVT::v4i8;
        NumElts = (NumElts + 3) / 4;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

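// Illustrative example: for a parameter of IR type { i32, <4 x half> },
// ComputePTXValueVTs produces the EVTs (i32, v2f16, v2f16) at byte offsets
// (0, 8, 12) -- the struct is traversed recursively and the <4 x half>
// vector is split into two packed f16x2 halves, keeping the flattened list
// in sync with the Ins/Outs arrays.
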
/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}

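// Example: i1 and i8 are left alone, i3 promotes to i8, i24 to i32, and i48
// to i64. The function returns true only when the promoted type differs from
// the original, telling the caller that an extend (and a truncate on the way
// back) is required.
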
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}

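// Example: four f32 pieces at offsets 0/4/8/12 of a 16-byte-aligned
// parameter can start a single 16-byte access at index 0, so this returns 4
// (one v4 op); if the parameter were only 4-byte aligned, every access size
// would fail the checks above and each piece would be handled as a scalar.
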
// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0, // Middle elements of a vector.
  PVF_FIRST = 0x1, // First element of the vector.
  PVF_LAST = 0x2,  // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

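// Example: four contiguous f32 pieces of a 16-byte-aligned parameter yield
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} (a single v4 load/store),
// while a variadic or insufficiently aligned parameter keeps the default
// all-PVF_SCALAR marking.
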
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or
  // memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoBF16Action) {
    bool IsOpSupported = STI.hasBF16Math();
    // A few instructions are available on sm_90 only.
    switch (Op) {
    case ISD::FADD:
    case ISD::FMUL:
    case ISD::FSUB:
    case ISD::SELECT:
    case ISD::SELECT_CC:
    case ISD::SETCC:
    case ISD::FEXP2:
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FNEARBYINT:
    case ISD::FRINT:
    case ISD::FROUNDEVEN:
    case ISD::FTRUNC:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoBF16Action);
  };

  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                     LegalizeAction NoI16x2Action) {
    bool IsOpSupported = false;
    // These instructions are available on sm_90 only.
    switch (Op) {
    case ISD::ADD:
    case ISD::SMAX:
    case ISD::SMIN:
    case ISD::UMIN:
    case ISD::UMAX:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Custom);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Conversion to/from BF16/BF16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Custom);

  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);

  // Conversion to/from i16/i16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Custom);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
  // Only logical ops can be done on v4i8 directly, others must be done
  // elementwise.
  setOperationAction(
      {/* the full list of non-logical ops is elided in this listing */},
      MVT::v4i8, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
                 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
                 MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcopy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // expand extload of vector of integers.
  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
                   MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Support varargs.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
  setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);

  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);

  // Other arithmetic and logic ops are unsupported.
  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                     MVT::v2i16, Expand);

  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                       ISD::LOAD, ISD::LSHR, ISD::MUL, ISD::SHL, ISD::SREM,
                       ISD::UREM, ISD::VSELECT});

  // setcc for f16x2 and bf16x2 needs special handling to prevent
  // legalizer's attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math() || STI.hasBF16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have full implementation. Others
  // only have token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    // bf16 must be promoted to f32.
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  // f16/f16x2 neg was introduced in PTX 60, SM_53.
  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
                                        STI.getPTXVersion() >= 60 &&
                                        STI.allowFP16Math();
  for (const auto &VT : {MVT::f16, MVT::v2f16})
    setOperationAction(ISD::FNEG, VT,
                       IsFP16FP16x2NegAvailable ? Legal : Expand);

  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FROUNDEVEN, ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
    setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
  }
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
    }
  }

  // sm_80 only has conversions between f32 and bf16. Custom lower all other
  // bf16 conversions.
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
      setOperationAction(
          {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
          VT, Custom);
    }
    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP},
                       MVT::bf16, Custom);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);
  setOperationAction(ISD::FROUND, MVT::bf16, Promote);
  AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  for (const auto &Op : {ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
  }

  // Custom lowering for inline asm with 128-bit operands
  setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
  setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());

  setMinCmpXchgSizeInBits(32);
  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {

#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;

  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;

    // One MAKE_CASE(NVPTXISD::...) per target-specific node; the full list
    // is elided in this listing.
  }
  return nullptr;

#undef MAKE_CASE
}

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  if (Isv2x16VT(VT))
    return TypeLegal;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}

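// Example: an f32 sqrt with no refinement steps becomes a direct call to
// nvvm_sqrt_approx_f (or its ftz variant), whereas a reciprocal sqrt, or any
// request with ExtraSteps > 0, returns nvvm_rsqrt_approx_f so the generic
// Newton-Raphson refinement can start from an rsqrt approximation; f64 has
// no sqrt.approx and is built as rcp(rsqrt(x)).
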
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

static bool IsTypePassedAsArray(const Type *Ty) {
  return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
         Ty->isHalfTy() || Ty->isBFloatTy();
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::string Prototype;
  raw_string_ostream O(Prototype);
  O << "prototype_" << UniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
        !IsTypePassedAsArray(retTy)) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size. fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      size = promoteScalarArgumentSize(size);

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (IsTypePassedAsArray(retTy)) {
      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
  for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (IsTypePassedAsArray(Ty)) {
        Align ParamAlign =
            getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
        O << ".param .align " << ParamAlign.value() << " .b8 ";
        O << "_";
        O << "[" << DL.getTypeAllocSize(Ty) << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // scalar type
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        sz = promoteScalarArgumentSize(sz);
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      } else {
        sz = Ty->getPrimitiveSizeInBits();
      }
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }

    // Indirect calls need strict ABI alignment so we disable optimizations by
    // not providing a function to optimize.
    Type *ETy = Args[i].IndirectType;
    Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
    Align ParamByValAlign =
        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);

    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
    O << "_";
    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
  }

  if (VAInfo)
    O << (first ? "" : ",") << " .param .align " << VAInfo->second
      << " .b8 _[]\n";
  O << ")";
  if (shouldEmitPTXNoReturn(&CB, *nvTM))
    O << " .noreturn";
  O << ";";

  return Prototype;
}

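// Illustrative output (assuming a 64-bit target): for an indirect call to a
// callee of type "float (float, int *)" this produces a string along the
// lines of
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// where the trailing index comes from UniqueCallSite, and scalar returns and
// integer arguments are widened to at least 32 bits per the PTX ABI.
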
Align NVPTXTargetLowering::getFunctionArgumentAlignment(
    const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
  return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
}

Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
                                                unsigned Idx,
                                                const DataLayout &DL) const {
  if (!CB) {
    // CallSite is zero, fallback to ABI type alignment
    return DL.getABITypeAlign(Ty);
  }

  const Function *DirectCallee = CB->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.

    // With bitcast'd call targets, the instruction will be the call
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
      // Check if we have call alignment metadata
      if (MaybeAlign StackAlign = getAlign(*CI, Idx))
        return StackAlign.value();
    }
    DirectCallee = getMaybeBitcastedCallee(CB);
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);

  // Call is indirect, fall back to the ABI type alignment
  return DL.getABITypeAlign(Ty);
}

static bool adjustElementType(EVT &ElementType) {
  switch (ElementType.getSimpleVT().SimpleTy) {
  default:
    return false;
  case MVT::f16:
  case MVT::bf16:
    ElementType = MVT::i16;
    return true;
  case MVT::f32:
  case MVT::v2f16:
  case MVT::v2bf16:
    ElementType = MVT::i32;
    return true;
  case MVT::f64:
    ElementType = MVT::i64;
    return true;
  }
}

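// Example: a f32 or packed v2f16 value is reinterpreted as i32, and a f64 as
// i64, before the byte-wise SRL/SHL/AND/OR logic in the helpers below, which
// is only defined for integer types.
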
// Use byte-store when the param address of the argument value is unaligned.
// This may happen when the return value is a field of a packed structure.
//
// This is called in LowerCall() when passing the param values.
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
                                        uint64_t Offset, EVT ElementType,
                                        SDValue StVal, SDValue &InGlue,
                                        unsigned ArgID, const SDLoc &dl) {
  // Bit logic only works on integer types
  if (adjustElementType(ElementType))
    StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);

  // Store each byte
  SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
                                   DAG.getConstant(i * 8, dl, MVT::i32));
    SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
                               DAG.getConstant(Offset + i, dl, MVT::i32),
                               ShiftVal, InGlue};
    // Trunc store only the last byte by using
    //     st.param.b8
    // The register type can be larger than b8.
    Chain = DAG.getMemIntrinsicNode(
        NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
        MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
    InGlue = Chain.getValue(1);
  }
  return Chain;
}

// Use byte-load when the param address of the returned value is unaligned.
// This may happen when the returned value is a field of a packed structure.
static SDValue
LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
                           EVT ElementType, SDValue &InGlue,
                           SmallVectorImpl<SDValue> &TempProxyRegOps,
                           const SDLoc &dl) {
  // Bit logic only works on integer types
  EVT MergedType = ElementType;
  adjustElementType(MergedType);

  // Load each byte and construct the whole value. Initial value to 0
  SDValue RetVal = DAG.getConstant(0, dl, MergedType);
  // LoadParamMemI8 loads into i16 register only
  SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
                              DAG.getConstant(Offset + i, dl, MVT::i32),
                              InGlue};
    // This will be selected to LoadParamMemI8
    SDValue LdVal =
        DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
                                MVT::i8, MachinePointerInfo(), Align(1));
    SDValue TmpLdVal = LdVal.getValue(0);
    Chain = LdVal.getValue(1);
    InGlue = LdVal.getValue(2);

    TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
                           TmpLdVal.getSimpleValueType(), TmpLdVal);
    TempProxyRegOps.push_back(TmpLdVal);

    SDValue CMask = DAG.getConstant(255, dl, MergedType);
    SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
    // Need to extend the i16 register to the whole width.
    TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
    // Mask off the high bits. Leave only the lower 8 bits.
    // Do this because we are using loadparam.b8.
    TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
    // Shift and merge
    TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
    RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
  }
  if (ElementType != MergedType)
    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);

  return RetVal;
}

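// Example: reloading an f32 field that is only 1-byte aligned within a
// packed struct return value issues four ld.param.b8 operations; each byte
// is zero-extended, masked, shifted into position, and OR-ed into an i32,
// which is finally bitcast back to f32.
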
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {

  if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
    report_fatal_error(
        "Support for variadic functions (unsized array parameter) introduced "
        "in PTX ISA version 6.0 and requires target sm_30.");

  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *RetTy = CLI.RetTy;
  const CallBase *CB = CLI.CB;
  const DataLayout &DL = DAG.getDataLayout();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  // Variadic arguments.
  //
  // Normally, for each argument, we declare a param scalar or a param
  // byte array in the .param space, and store the argument value to that
  // param scalar or array starting at offset 0.
  //
  // In the case of the first variadic argument, we declare a vararg byte array
  // with size 0. The exact size of this array isn't known at this point, so
  // it'll be patched later. All the variadic arguments will be stored to this
  // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
  // initially set to 0, so it can be used for non-variadic arguments (which use
  // 0 offset) to simplify the code.
  //
  // After all the variadic arguments have been processed, 'VAOffset' holds the
  // size of the vararg byte array.

  SDValue VADeclareParam;                 // vararg byte array
  unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
  unsigned VAOffset = 0;                  // current offset in the param array

  unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
  SDValue TempChain = Chain;
  Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
  SDValue InGlue = Chain.getValue(1);

  unsigned ParamCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See similar issue in LowerFormalArguments.
  unsigned OIdx = 0;
  // Declare the .params or .regs needed to pass values
  // to the function
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    EVT VT = Outs[OIdx].VT;
    Type *Ty = Args[i].Ty;
    bool IsVAArg = (i >= CLI.NumFixedArgs);
    bool IsByVal = Outs[OIdx].Flags.isByVal();

    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;

    assert((!IsByVal || Args[i].IndirectType) &&
           "byval arg must have indirect type");
    Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
    ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);

    Align ArgAlign;
    if (IsByVal) {
      // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
      // so we don't need to worry whether it's naturally aligned or not.
      // See TargetLowering::LowerCallTo().
      Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
      ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
                                            InitialAlign, DL);
      if (IsVAArg)
        VAOffset = alignTo(VAOffset, ArgAlign);
    } else {
      ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
    }

    unsigned TypeSize =
        (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    bool NeedAlign; // Does argument declaration specify alignment?
    bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
    if (IsVAArg) {
      if (ParamCount == FirstVAArg) {
        SDValue DeclareParamOps[] = {
            Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
            DAG.getConstant(ParamCount, dl, MVT::i32),
            DAG.getConstant(1, dl, MVT::i32), InGlue};
        VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
                                             DeclareParamVTs, DeclareParamOps);
      }
      NeedAlign = PassAsArray;
    } else if (PassAsArray) {
      // declare .param .align <align> .b8 .param<n>[<size>];
      SDValue DeclareParamOps[] = {
          Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
          DAG.getConstant(ParamCount, dl, MVT::i32),
          DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
      Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                          DeclareParamOps);
      NeedAlign = true;
    } else {
      // declare .param .b<size> .param<n>;
      if (VT.isInteger() || VT.isFloatingPoint()) {
        // PTX ABI requires integral types to be at least 32 bits in
        // size. FP16 is loaded/stored using i16, so it's handled
        // here as well.
        TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
      }
      SDValue DeclareScalarParamOps[] = {
          Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
          DAG.getConstant(TypeSize * 8, dl, MVT::i32),
          DAG.getConstant(0, dl, MVT::i32), InGlue};
      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
                          DeclareScalarParamOps);
      NeedAlign = false;
    }
    InGlue = Chain.getValue(1);

    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
    // than 32-bits are sign extended or zero extended, depending on
    // whether they are signed or unsigned types. This case applies
    // only to scalar parameters and not to aggregate values.
    bool ExtendIntegerParam =
        Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;

    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
    SmallVector<SDValue, 6> StoreOperands;
    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
      EVT EltVT = VTs[j];
      int CurOffset = Offsets[j];
      MaybeAlign PartAlign;
      if (NeedAlign)
        PartAlign = commonAlignment(ArgAlign, CurOffset);

      SDValue StVal = OutVals[OIdx];

      MVT PromotedVT;
      if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
        EltVT = EVT(PromotedVT);
      }
      if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
        llvm::ISD::NodeType Ext =
            Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
        StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
      }

      if (IsByVal) {
        auto PtrVT = getPointerTy(DL);
        SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
                                      DAG.getConstant(CurOffset, dl, PtrVT));
        StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
                            PartAlign);
      } else if (ExtendIntegerParam) {
        assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
        // zext/sext to i32
        StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                      : ISD::ZERO_EXTEND,
                            dl, MVT::i32, StVal);
      }

      if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
        // Use 16-bit registers for small stores as it's the
        // smallest general purpose register size supported by NVPTX.
        StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
      }

      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
      // scalar store. In such cases, fall back to byte stores.
      if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
          PartAlign.value() <
              DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
        assert(StoreOperands.empty() && "Unfinished preceding store.");
        Chain = LowerUnalignedStoreParam(
            DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
            StVal, InGlue, ParamCount, dl);

        // LowerUnalignedStoreParam took care of inserting the necessary nodes
        // into the SDAG, so just move on to the next element.
        if (!IsByVal)
          ++OIdx;
        continue;
      }

      // New store.
      if (VectorInfo[j] & PVF_FIRST) {
        assert(StoreOperands.empty() && "Unfinished preceding store.");
        StoreOperands.push_back(Chain);
        StoreOperands.push_back(
            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));

        StoreOperands.push_back(DAG.getConstant(
            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
            dl, MVT::i32));
      }

      // Record the value to store.
      StoreOperands.push_back(StVal);

      if (VectorInfo[j] & PVF_LAST) {
        unsigned NumElts = StoreOperands.size() - 3;
        NVPTXISD::NodeType Op;
        switch (NumElts) {
        case 1:
          Op = NVPTXISD::StoreParam;
          break;
        case 2:
          Op = NVPTXISD::StoreParamV2;
          break;
        case 4:
          Op = NVPTXISD::StoreParamV4;
          break;
        default:
          llvm_unreachable("Invalid vector info.");
        }

        StoreOperands.push_back(InGlue);

        // Adjust type of the store op if we've extended the scalar
        // return value.
        EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;

        Chain = DAG.getMemIntrinsicNode(
            Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
            TheStoreType, MachinePointerInfo(), PartAlign,
            MachineMemOperand::MOStore);
        InGlue = Chain.getValue(1);

        // Cleanup.
        StoreOperands.clear();

        // TODO: We may need to support vector types that can be passed
        // as scalars in variadic arguments.
        if (!IsByVal && IsVAArg) {
          assert(NumElts == 1 &&
                 "Vectorization is expected to be disabled for variadics.");
          VAOffset += DL.getTypeAllocSize(
              TheStoreType.getTypeForEVT(*DAG.getContext()));
        }
      }
      if (!IsByVal)
        ++OIdx;
    }
    assert(StoreOperands.empty() && "Unfinished parameter store.");
    if (!IsByVal && VTs.size() > 0)
      --OIdx;
    ++ParamCount;
    if (IsByVal && IsVAArg)
      VAOffset += TypeSize;
  }

  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  MaybeAlign retAlignment = std::nullopt;

  // Handle Result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> resvtparts;
    ComputeValueVTs(*this, DL, RetTy, resvtparts);

    // Declare
    //  .param .align N .b8 retval0[<size-in-bytes>], or
    //  .param .b<size-in-bits> retval0
    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
    if (!IsTypePassedAsArray(RetTy)) {
      resultsz = promoteScalarArgumentSize(resultsz);
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                                  DAG.getConstant(resultsz, dl, MVT::i32),
                                  DAG.getConstant(0, dl, MVT::i32), InGlue };
      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
                          DeclareRetOps);
      InGlue = Chain.getValue(1);
    } else {
      retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
      assert(retAlignment && "retAlignment is guaranteed to be set");
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = {
          Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
          DAG.getConstant(resultsz / 8, dl, MVT::i32),
          DAG.getConstant(0, dl, MVT::i32), InGlue};
      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
                          DeclareRetOps);
      InGlue = Chain.getValue(1);
    }
  }

  bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
  // Set the size of the vararg param byte array if the callee is a variadic
  // function and the variadic part is not empty.
  if (HasVAArgs) {
    SDValue DeclareParamOps[] = {
        VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
        VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
        VADeclareParam.getOperand(4)};
    DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
                    VADeclareParam->getVTList(), DeclareParamOps);
  }

  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
  // between them we must rely on the call site value which is valid for
  // indirect calls but is always null for libcalls.
  bool isIndirectCall = !Func && CB;

  if (isa<ExternalSymbolSDNode>(Callee)) {
    Function* CalleeFunc = nullptr;

    // Try to find the callee in the current module.
    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");

    // Set the "libcall callee" attribute to indicate that the function
    // must always have a declaration.
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  }

  if (isIndirectCall) {
    // This is an indirect function call: PTX requires a prototype of the
    // form
    //   proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the
    // call instruction.
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    std::string Proto = getPrototype(
        DL, RetTy, Args, Outs, retAlignment,
        HasVAArgs
            ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
                  CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
            : std::nullopt,
        *CB, UniqueCallSite);
    const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
    SDValue ProtoOps[] = {
        Chain,
        DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
        InGlue,
    };
    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
    InGlue = Chain.getValue(1);
  }
  // Op to just print "call"
  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue PrintCallOps[] = {
    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
  };
  // We model convergent calls as separate opcodes.
  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
  if (CLI.IsConvergent)
    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
                                              : NVPTXISD::PrintConvergentCall;
  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
  InGlue = Chain.getValue(1);

  // Ops to print out the function name
  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallVoidOps[] = { Chain, Callee, InGlue };
  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  InGlue = Chain.getValue(1);

  // Ops to print out the param list
  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgBeginOps[] = { Chain, InGlue };
  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
                      CallArgBeginOps);
  InGlue = Chain.getValue(1);

  for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
       ++i) {
    unsigned opcode;
    if (i == (e - 1))
      opcode = NVPTXISD::LastCallArg;
    else
      opcode = NVPTXISD::CallArg;
    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                             DAG.getConstant(i, dl, MVT::i32), InGlue };
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
    InGlue = Chain.getValue(1);
  }
  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgEndOps[] = { Chain,
                              DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
                              InGlue };
  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  InGlue = Chain.getValue(1);

  if (isIndirectCall) {
    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue PrototypeOps[] = {
        Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
    InGlue = Chain.getValue(1);
  }

  SmallVector<SDValue, 16> ProxyRegOps;
  SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
  // An item of the vector is filled if the element does not need a ProxyReg
  // operation on it and should be added to InVals as is. ProxyRegOps and
  // ProxyRegTruncates contain empty/none items at the same index.
  SmallVector<SDValue, 16> RetElts;
  // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
  // to use the values of `LoadParam`s; they are replaced later, once
  // `CALLSEQ_END` has been added.
  SmallVector<SDValue, 16> TempProxyRegOps;

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
    assert(VTs.size() == Ins.size() && "Bad value decomposition");

    Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

    SmallVector<EVT, 6> LoadVTs;
    int VecIdx = -1; // Index of the first element of the vector.

    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
    // 32-bits are sign extended or zero extended, depending on whether
    // they are signed or unsigned types.
    bool ExtendIntegerRetVal =
        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
      bool needTruncate = false;
      EVT TheLoadType = VTs[i];
      EVT EltType = Ins[i].VT;
      Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
      MVT PromotedVT;

      if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
        TheLoadType = EVT(PromotedVT);
        EltType = EVT(PromotedVT);
        needTruncate = true;
      }

      if (ExtendIntegerRetVal) {
        TheLoadType = MVT::i32;
        EltType = MVT::i32;
        needTruncate = true;
      } else if (TheLoadType.getSizeInBits() < 16) {
        if (VTs[i].isInteger())
          needTruncate = true;
        EltType = MVT::i16;
      }

      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
      // scalar load. In such cases, fall back to byte loads.
      if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
          EltAlign < DL.getABITypeAlign(
                         TheLoadType.getTypeForEVT(*DAG.getContext()))) {
        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
        SDValue Ret = LowerUnalignedLoadRetParam(
            DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
        ProxyRegOps.push_back(SDValue());
        ProxyRegTruncates.push_back(std::optional<MVT>());
        RetElts.resize(i);
        RetElts.push_back(Ret);

        continue;
      }

      // Record index of the very first element of the vector.
      if (VectorInfo[i] & PVF_FIRST) {
        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
        VecIdx = i;
      }

      LoadVTs.push_back(EltType);

      if (VectorInfo[i] & PVF_LAST) {
        unsigned NumElts = LoadVTs.size();
        LoadVTs.push_back(MVT::Other);
        LoadVTs.push_back(MVT::Glue);
        NVPTXISD::NodeType Op;
        switch (NumElts) {
        case 1:
          Op = NVPTXISD::LoadParam;
          break;
        case 2:
          Op = NVPTXISD::LoadParamV2;
          break;
        case 4:
          Op = NVPTXISD::LoadParamV4;
          break;
        default:
          llvm_unreachable("Invalid vector info.");
        }

        SDValue LoadOperands[] = {
            Chain, DAG.getConstant(1, dl, MVT::i32),
            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
        SDValue RetVal = DAG.getMemIntrinsicNode(
            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
            MachinePointerInfo(), EltAlign,
            MachineMemOperand::MOLoad);

        for (unsigned j = 0; j < NumElts; ++j) {
          ProxyRegOps.push_back(RetVal.getValue(j));

          if (needTruncate)
            ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
          else
            ProxyRegTruncates.push_back(std::optional<MVT>());
        }

        Chain = RetVal.getValue(NumElts);
        InGlue = RetVal.getValue(NumElts + 1);

        // Cleanup
        VecIdx = -1;
        LoadVTs.clear();
      }
    }
  }

  Chain =
      DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
  InGlue = Chain.getValue(1);

  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can
  // become dangling.
  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
    if (i < RetElts.size() && RetElts[i]) {
      InVals.push_back(RetElts[i]);
      continue;
    }

    SDValue Ret = DAG.getNode(
        NVPTXISD::ProxyReg, dl,
        DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
        { Chain, ProxyRegOps[i], InGlue }
    );

    Chain = Ret.getValue(1);
    InGlue = Ret.getValue(2);

    if (ProxyRegTruncates[i]) {
      Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
    }

    InVals.push_back(Ret);
  }

  for (SDValue &T : TempProxyRegOps) {
    SDValue Repl = DAG.getNode(
        NVPTXISD::ProxyReg, dl,
        DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
        {Chain, T.getOperand(0), InGlue});
    DAG.ReplaceAllUsesWith(T, Repl);
    DAG.RemoveDeadNode(T.getNode());

    Chain = Repl.getValue(1);
    InGlue = Repl.getValue(2);
  }

  // Set isTailCall to false for now, until we figure out how to express
  // tail call optimization in PTX.
  isTailCall = false;
  return Chain;
}

2213 SelectionDAG &DAG) const {
2214
2215 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2216 const Function &Fn = DAG.getMachineFunction().getFunction();
2217
2218 DiagnosticInfoUnsupported NoDynamicAlloca(
2219 Fn,
2220 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2221 "requires target sm_52.",
2222 SDLoc(Op).getDebugLoc());
2223 DAG.getContext()->diagnose(NoDynamicAlloca);
2224 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2225 Op.getOperand(0)};
2226 return DAG.getMergeValues(Ops, SDLoc());
2227 }
2228
2229 SDValue Chain = Op.getOperand(0);
2230 SDValue Size = Op.getOperand(1);
2231 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2232 SDLoc DL(Op.getNode());
2233
2234 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2235 if (nvTM->is64Bit())
2236 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2237 else
2238 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2239
2240 SDValue AllocOps[] = {Chain, Size,
2241 DAG.getTargetConstant(Align, DL, MVT::i32)};
2242 SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
2243 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2244
2245 SDValue MergeOps[] = {Alloca, Chain};
2246 return DAG.getMergeValues(MergeOps, DL);
2247}
2248
2249// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2250// (see LegalizeDAG.cpp). This is slow and uses local memory.
2251 // We use extract/insert/build vector instead, just as LegalizeOp() did in llvm 2.5.
2252SDValue
2253NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2254 SDNode *Node = Op.getNode();
2255 SDLoc dl(Node);
2256 SmallVector<SDValue, 8> Ops;
2257 unsigned NumOperands = Node->getNumOperands();
2258 for (unsigned i = 0; i < NumOperands; ++i) {
2259 SDValue SubOp = Node->getOperand(i);
2260 EVT VVT = SubOp.getNode()->getValueType(0);
2261 EVT EltVT = VVT.getVectorElementType();
2262 unsigned NumSubElem = VVT.getVectorNumElements();
2263 for (unsigned j = 0; j < NumSubElem; ++j) {
2264 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2265 DAG.getIntPtrConstant(j, dl)));
2266 }
2267 }
2268 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2269}
2270
2271// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2272// would get lowered as two constant loads and vector-packing move.
2273// Instead we want just a constant move:
2274// mov.b32 %r2, 0x40003C00
2275SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2276 SelectionDAG &DAG) const {
2277 EVT VT = Op->getValueType(0);
2278 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2279 return Op;
2280
2281 SDLoc DL(Op);
2282
2283 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2284 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2285 isa<ConstantFPSDNode>(Operand);
2286 })) {
2287 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2288 // to optimize calculation of constant parts.
2289 if (VT == MVT::v4i8) {
2290 SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2291 SDValue E01 = DAG.getNode(
2292 NVPTXISD::BFI, DL, MVT::i32,
2293 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2294 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2295 SDValue E012 =
2296 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2297 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2298 E01, DAG.getConstant(16, DL, MVT::i32), C8);
2299 SDValue E0123 =
2300 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2301 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2302 E012, DAG.getConstant(24, DL, MVT::i32), C8);
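// For illustration: operands <0x01, 0x02, 0x03, 0x04> give E01 = 0x0201,
// E012 = 0x030201, and E0123 = 0x04030201, so element 0 lands in the least
// significant byte of the packed i32.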
2303 return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2304 }
2305 return Op;
2306 }
2307
2308 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2309 auto GetOperand = [](SDValue Op, int N) -> APInt {
2310 const SDValue &Operand = Op->getOperand(N);
2311 EVT VT = Op->getValueType(0);
2312 if (Operand->isUndef())
2313 return APInt(32, 0);
2314 APInt Value;
2315 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2316 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2317 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2318 Value = Operand->getAsAPIntVal();
2319 else
2320 llvm_unreachable("Unsupported type");
2321 // i8 values are carried around as i16, so we need to zero out the upper
2322 // bits so they do not get in the way of combining individual byte values.
2323 if (VT == MVT::v4i8)
2324 Value = Value.trunc(8);
2325 return Value.zext(32);
2326 };
2327 APInt Value;
2328 if (Isv2x16VT(VT)) {
2329 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2330 } else if (VT == MVT::v4i8) {
2331 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2332 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2333 } else {
2334 llvm_unreachable("Unsupported type");
2335 }
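// For example, v2f16 <1.0, 2.0> packs as 0x3C00 | (0x4000 << 16), i.e. the
// constant 0x40003C00 shown in the mov.b32 comment above.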
2336 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2337 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2338}
2339
2340SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2341 SelectionDAG &DAG) const {
2342 SDValue Index = Op->getOperand(1);
2343 SDValue Vector = Op->getOperand(0);
2344 SDLoc DL(Op);
2345 EVT VectorVT = Vector.getValueType();
2346
2347 if (VectorVT == MVT::v4i8) {
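// Each v4i8 element is one byte of the packed i32, so extract it with a
// byte-wise BFE: bit offset Index * 8, width 8.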
2348 SDValue BFE =
2349 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2350 {Vector,
2351 DAG.getNode(ISD::MUL, DL, MVT::i32,
2352 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2353 DAG.getConstant(8, DL, MVT::i32)),
2354 DAG.getConstant(8, DL, MVT::i32)});
2355 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2356 }
2357
2358 // Constant index will be matched by tablegen.
2359 if (isa<ConstantSDNode>(Index.getNode()))
2360 return Op;
2361
2362 // Extract individual elements and select one of them.
2363 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2364 EVT EltVT = VectorVT.getVectorElementType();
2365
2366 SDLoc dl(Op.getNode());
2367 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2368 DAG.getIntPtrConstant(0, dl));
2369 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2370 DAG.getIntPtrConstant(1, dl));
2371 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2372 ISD::CondCode::SETEQ);
2373}
2374
2375SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2376 SelectionDAG &DAG) const {
2377 SDValue Vector = Op->getOperand(0);
2378 EVT VectorVT = Vector.getValueType();
2379
2380 if (VectorVT != MVT::v4i8)
2381 return Op;
2382 SDLoc DL(Op);
2383 SDValue Value = Op->getOperand(1);
2384 if (Value->isUndef())
2385 return Vector;
2386
2387 SDValue Index = Op->getOperand(2);
2388
2389 SDValue BFI =
2390 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2391 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2392 DAG.getNode(ISD::MUL, DL, MVT::i32,
2393 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2394 DAG.getConstant(8, DL, MVT::i32)),
2395 DAG.getConstant(8, DL, MVT::i32)});
2396 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2397}
2398
2399SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2400 SelectionDAG &DAG) const {
2401 SDValue V1 = Op.getOperand(0);
2402 EVT VectorVT = V1.getValueType();
2403 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2404 return Op;
2405
2406 // Lower shuffle to PRMT instruction.
2407 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2408 SDValue V2 = Op.getOperand(1);
2409 uint32_t Selector = 0;
2410 for (auto I : llvm::enumerate(SVN->getMask())) {
2411 if (I.value() != -1) // -1 is a placeholder for undef.
2412 Selector |= (I.value() << (I.index() * 4));
2413 }
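// For example, the mask <0, 1, 4, 5> builds the selector 0x5410; selector
// indices 4-7 refer to bytes of the second source operand (V2).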
2414
2415 SDLoc DL(Op);
2416 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2417 DAG.getConstant(Selector, DL, MVT::i32),
2418 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2419}
2420/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2421 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2422 /// amount, or
2423 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2424/// amount.
2425SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2426 SelectionDAG &DAG) const {
2427 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2428 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2429
2430 EVT VT = Op.getValueType();
2431 unsigned VTBits = VT.getSizeInBits();
2432 SDLoc dl(Op);
2433 SDValue ShOpLo = Op.getOperand(0);
2434 SDValue ShOpHi = Op.getOperand(1);
2435 SDValue ShAmt = Op.getOperand(2);
2436 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2437
2438 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2439 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2440 // {dHi, dLo} = {aHi, aLo} >> Amt
2441 // dHi = aHi >> Amt
2442 // dLo = shf.r.clamp aLo, aHi, Amt
2443
2444 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2445 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2446 ShAmt);
2447
2448 SDValue Ops[2] = { Lo, Hi };
2449 return DAG.getMergeValues(Ops, dl);
2450 }
2451 else {
2452 // {dHi, dLo} = {aHi, aLo} >> Amt
2453 // - if (Amt>=size) then
2454 // dLo = aHi >> (Amt-size)
2455 // dHi = aHi >> Amt (this is either all 0 or all 1)
2456 // else
2457 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2458 // dHi = aHi >> Amt
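// For example, with 32-bit halves and Amt = 40: dLo = aHi >> 8, while dHi
// is the zero-fill (SRL) or sign-fill (SRA) of aHi.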
2459
2460 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2461 DAG.getConstant(VTBits, dl, MVT::i32),
2462 ShAmt);
2463 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2464 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2465 DAG.getConstant(VTBits, dl, MVT::i32));
2466 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2467 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2468 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2469
2470 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2471 DAG.getConstant(VTBits, dl, MVT::i32),
2472 ISD::SETGE);
2473 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2474 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2475
2476 SDValue Ops[2] = { Lo, Hi };
2477 return DAG.getMergeValues(Ops, dl);
2478 }
2479}
2480
2481/// LowerShiftLeftParts - Lower SHL_PARTS, which
2482 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2483 /// amount, or
2484 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2485/// amount.
2486SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2487 SelectionDAG &DAG) const {
2488 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2489 assert(Op.getOpcode() == ISD::SHL_PARTS);
2490
2491 EVT VT = Op.getValueType();
2492 unsigned VTBits = VT.getSizeInBits();
2493 SDLoc dl(Op);
2494 SDValue ShOpLo = Op.getOperand(0);
2495 SDValue ShOpHi = Op.getOperand(1);
2496 SDValue ShAmt = Op.getOperand(2);
2497
2498 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2499 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2500 // {dHi, dLo} = {aHi, aLo} << Amt
2501 // dHi = shf.l.clamp aLo, aHi, Amt
2502 // dLo = aLo << Amt
2503
2504 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2505 ShAmt);
2506 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2507
2508 SDValue Ops[2] = { Lo, Hi };
2509 return DAG.getMergeValues(Ops, dl);
2510 }
2511 else {
2512 // {dHi, dLo} = {aHi, aLo} << Amt
2513 // - if (Amt>=size) then
2514 // dLo = aLo << Amt (all 0)
2515 // dHi = aLo << (Amt-size)
2516 // else
2517 // dLo = aLo << Amt
2518 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
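// For example, with 32-bit halves and Amt = 40: dHi = aLo << 8 and dLo = 0.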
2519
2520 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2521 DAG.getConstant(VTBits, dl, MVT::i32),
2522 ShAmt);
2523 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2524 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2525 DAG.getConstant(VTBits, dl, MVT::i32));
2526 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2527 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2528 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2529
2530 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2531 DAG.getConstant(VTBits, dl, MVT::i32),
2532 ISD::SETGE);
2533 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2534 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2535
2536 SDValue Ops[2] = { Lo, Hi };
2537 return DAG.getMergeValues(Ops, dl);
2538 }
2539}
2540
2541SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2542 EVT VT = Op.getValueType();
2543
2544 if (VT == MVT::f32)
2545 return LowerFROUND32(Op, DAG);
2546
2547 if (VT == MVT::f64)
2548 return LowerFROUND64(Op, DAG);
2549
2550 llvm_unreachable("unhandled type");
2551}
2552
2553 // This is the rounding method used in CUDA libdevice, in C-like code:
2554// float roundf(float A)
2555// {
2556// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2557// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2558// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2559// }
2560SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2561 SelectionDAG &DAG) const {
2562 SDLoc SL(Op);
2563 SDValue A = Op.getOperand(0);
2564 EVT VT = Op.getValueType();
2565
2566 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2567
2568 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2569 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2570 const int SignBitMask = 0x80000000;
2571 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2572 DAG.getConstant(SignBitMask, SL, MVT::i32));
2573 const int PointFiveInBits = 0x3F000000;
2574 SDValue PointFiveWithSignRaw =
2575 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2576 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2577 SDValue PointFiveWithSign =
2578 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2579 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2580 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2581
2582 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2583 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2584 SDValue IsLarge =
2585 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2586 ISD::SETOGT);
2587 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2588
2589 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2590 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2591 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2592 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2593 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2594}
2595
2596// The implementation of round(double) is similar to that of round(float) in
2597// that they both separate the value range into three regions and use a method
2598// specific to the region to round the values. However, round(double) first
2599// calculates the round of the absolute value and then adds the sign back while
2600// round(float) directly rounds the value with sign.
2601SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2602 SelectionDAG &DAG) const {
2603 SDLoc SL(Op);
2604 SDValue A = Op.getOperand(0);
2605 EVT VT = Op.getValueType();
2606
2607 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2608
2609 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2610 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2611 DAG.getConstantFP(0.5, SL, VT));
2612 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2613
2614 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2615 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2616 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2617 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2618 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2619 DAG.getConstantFP(0, SL, VT),
2620 RoundedA);
2621
2622 // Add sign to rounded_A
2623 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2624 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2625
2626 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2627 SDValue IsLarge =
2628 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2629 ISD::SETOGT);
2630 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2631}
2632
2633SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2634 SelectionDAG &DAG) const {
2635 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2636
2637 if (Op.getValueType() == MVT::bf16) {
2638 SDLoc Loc(Op);
2639 return DAG.getNode(
2640 ISD::FP_ROUND, Loc, MVT::bf16,
2641 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2642 DAG.getIntPtrConstant(0, Loc));
2643 }
2644
2645 // Everything else is considered legal.
2646 return Op;
2647}
2648
2649SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2650 SelectionDAG &DAG) const {
2651 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2652
2653 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2654 SDLoc Loc(Op);
2655 return DAG.getNode(
2656 Op.getOpcode(), Loc, Op.getValueType(),
2657 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2658 }
2659
2660 // Everything else is considered legal.
2661 return Op;
2662}
2663
2664SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2665 SelectionDAG &DAG) const {
2666 EVT NarrowVT = Op.getValueType();
2667 SDValue Wide = Op.getOperand(0);
2668 EVT WideVT = Wide.getValueType();
2669 if (NarrowVT.getScalarType() == MVT::bf16) {
2670 const TargetLowering *TLI = STI.getTargetLowering();
2671 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2672 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2673 }
2674 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2675 // This combination was the first to support f32 -> bf16.
2676 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2677 if (WideVT.getScalarType() == MVT::f32) {
2678 return Op;
2679 }
2680 if (WideVT.getScalarType() == MVT::f64) {
2681 SDLoc Loc(Op);
2682 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2683 // the hardware f32 -> bf16 instruction.
2684 SDValue rod = TLI->expandRoundInexactToOdd(
2685 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2686 : MVT::f32,
2687 Wide, Loc, DAG);
2688 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2689 }
2690 }
2691 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2692 }
2693 }
2694
2695 // Everything else is considered legal.
2696 return Op;
2697}
2698
2699SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2700 SelectionDAG &DAG) const {
2701 SDValue Narrow = Op.getOperand(0);
2702 EVT NarrowVT = Narrow.getValueType();
2703 EVT WideVT = Op.getValueType();
2704 if (NarrowVT.getScalarType() == MVT::bf16) {
2705 if (WideVT.getScalarType() == MVT::f32 &&
2706 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2707 SDLoc Loc(Op);
2708 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2709 }
2710 if (WideVT.getScalarType() == MVT::f64 &&
2711 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2712 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2713 : MVT::f32;
2714 SDLoc Loc(Op);
2715 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2716 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2717 } else {
2718 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2719 }
2720 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2721 }
2722 }
2723
2724 // Everything else is considered legal.
2725 return Op;
2726}
2727
2728 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2729 SDLoc DL(Op);
2730 if (Op.getValueType() != MVT::v2i16)
2731 return Op;
2732 EVT EltVT = Op.getValueType().getVectorElementType();
2733 SmallVector<SDValue> VecElements;
2734 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2735 SmallVector<SDValue> ScalarArgs;
2736 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2737 [&](const SDUse &O) {
2738 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2739 O.get(), DAG.getIntPtrConstant(I, DL));
2740 });
2741 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2742 }
2743 SDValue V =
2744 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2745 return V;
2746}
2747
2748SDValue
2749 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2750 switch (Op.getOpcode()) {
2751 case ISD::RETURNADDR:
2752 return SDValue();
2753 case ISD::FRAMEADDR:
2754 return SDValue();
2755 case ISD::GlobalAddress:
2756 return LowerGlobalAddress(Op, DAG);
2757 case ISD::INTRINSIC_W_CHAIN:
2758 return Op;
2759 case ISD::BUILD_VECTOR:
2760 return LowerBUILD_VECTOR(Op, DAG);
2761 case ISD::EXTRACT_SUBVECTOR:
2762 return Op;
2763 case ISD::EXTRACT_VECTOR_ELT:
2764 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2765 case ISD::INSERT_VECTOR_ELT:
2766 return LowerINSERT_VECTOR_ELT(Op, DAG);
2767 case ISD::VECTOR_SHUFFLE:
2768 return LowerVECTOR_SHUFFLE(Op, DAG);
2769 case ISD::CONCAT_VECTORS:
2770 return LowerCONCAT_VECTORS(Op, DAG);
2771 case ISD::STORE:
2772 return LowerSTORE(Op, DAG);
2773 case ISD::LOAD:
2774 return LowerLOAD(Op, DAG);
2775 case ISD::SHL_PARTS:
2776 return LowerShiftLeftParts(Op, DAG);
2777 case ISD::SRA_PARTS:
2778 case ISD::SRL_PARTS:
2779 return LowerShiftRightParts(Op, DAG);
2780 case ISD::SELECT:
2781 return LowerSelect(Op, DAG);
2782 case ISD::FROUND:
2783 return LowerFROUND(Op, DAG);
2784 case ISD::SINT_TO_FP:
2785 case ISD::UINT_TO_FP:
2786 return LowerINT_TO_FP(Op, DAG);
2787 case ISD::FP_TO_SINT:
2788 case ISD::FP_TO_UINT:
2789 return LowerFP_TO_INT(Op, DAG);
2790 case ISD::FP_ROUND:
2791 return LowerFP_ROUND(Op, DAG);
2792 case ISD::FP_EXTEND:
2793 return LowerFP_EXTEND(Op, DAG);
2794 case ISD::VAARG:
2795 return LowerVAARG(Op, DAG);
2796 case ISD::VASTART:
2797 return LowerVASTART(Op, DAG);
2798 case ISD::ABS:
2799 case ISD::SMIN:
2800 case ISD::SMAX:
2801 case ISD::UMIN:
2802 case ISD::UMAX:
2803 case ISD::ADD:
2804 case ISD::SUB:
2805 case ISD::MUL:
2806 case ISD::SHL:
2807 case ISD::SREM:
2808 case ISD::UREM:
2809 return LowerVectorArith(Op, DAG);
2810 case ISD::DYNAMIC_STACKALLOC:
2811 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2812 case ISD::CopyToReg:
2813 return LowerCopyToReg_128(Op, DAG);
2814 default:
2815 llvm_unreachable("Custom lowering not defined for operation");
2816 }
2817}
2818
2819// This function is almost a copy of SelectionDAG::expandVAArg().
2820// The only diff is that this one produces loads from local address space.
2821SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2822 const TargetLowering *TLI = STI.getTargetLowering();
2823 SDLoc DL(Op);
2824
2825 SDNode *Node = Op.getNode();
2826 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2827 EVT VT = Node->getValueType(0);
2828 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2829 SDValue Tmp1 = Node->getOperand(0);
2830 SDValue Tmp2 = Node->getOperand(1);
2831 const MaybeAlign MA(Node->getConstantOperandVal(3));
2832
2833 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2834 Tmp1, Tmp2, MachinePointerInfo(V));
2835 SDValue VAList = VAListLoad;
2836
2837 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2838 VAList = DAG.getNode(
2839 ISD::ADD, DL, VAList.getValueType(), VAList,
2840 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2841
2842 VAList = DAG.getNode(
2843 ISD::AND, DL, VAList.getValueType(), VAList,
2844 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2845 }
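// The add/and pair above is the usual align-up idiom:
// VAList = (VAList + Align - 1) & ~(Align - 1).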
2846
2847 // Increment the pointer, VAList, to the next vaarg
2848 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2849 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2850 DL, VAList.getValueType()));
2851
2852 // Store the incremented VAList to the legalized pointer
2853 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2854 MachinePointerInfo(V));
2855
2856 const Value *SrcV =
2857 cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2858
2859 // Load the actual argument out of the pointer VAList
2860 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2861}
2862
2863SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2864 const TargetLowering *TLI = STI.getTargetLowering();
2865 SDLoc DL(Op);
2866 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2867
2868 // Store the address of unsized array <function>_vararg[] in the ap object.
2869 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2870 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2871
2872 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2873 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2874 MachinePointerInfo(SV));
2875}
2876
2877SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2878 SDValue Op0 = Op->getOperand(0);
2879 SDValue Op1 = Op->getOperand(1);
2880 SDValue Op2 = Op->getOperand(2);
2881 SDLoc DL(Op.getNode());
2882
2883 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2884
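// PTX selp does not operate on predicate registers, so widen the i1
// operands to i32, select in that width, and truncate the result back to i1.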
2885 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2886 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2887 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2888 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2889
2890 return Trunc;
2891}
2892
2893SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2894 if (Op.getValueType() == MVT::i1)
2895 return LowerLOADi1(Op, DAG);
2896
2897 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2898 // unaligned loads and have to handle it here.
2899 EVT VT = Op.getValueType();
2900 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2901 LoadSDNode *Load = cast<LoadSDNode>(Op);
2902 EVT MemVT = Load->getMemoryVT();
2903 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2904 MemVT, *Load->getMemOperand())) {
2905 SDValue Ops[2];
2906 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2907 return DAG.getMergeValues(Ops, SDLoc(Op));
2908 }
2909 }
2910
2911 return SDValue();
2912}
2913
2914// v = ld i1* addr
2915// =>
2916// v1 = ld i8* addr (-> i16)
2917// v = trunc i16 to i1
2918SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2919 SDNode *Node = Op.getNode();
2920 LoadSDNode *LD = cast<LoadSDNode>(Node);
2921 SDLoc dl(Node);
2922 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2923 assert(Node->getValueType(0) == MVT::i1 &&
2924 "Custom lowering for i1 load only");
2925 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
2926 LD->getBasePtr(), LD->getPointerInfo(),
2927 MVT::i8, LD->getAlign(),
2928 LD->getMemOperand()->getFlags());
2929 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2930 // The legalizer (the caller) is expecting two values from the legalized
2931 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2932 // in LegalizeDAG.cpp which also uses MergeValues.
2933 SDValue Ops[] = { result, LD->getChain() };
2934 return DAG.getMergeValues(Ops, dl);
2935}
2936
2937SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2938 StoreSDNode *Store = cast<StoreSDNode>(Op);
2939 EVT VT = Store->getMemoryVT();
2940
2941 if (VT == MVT::i1)
2942 return LowerSTOREi1(Op, DAG);
2943
2944 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2945 // stores and have to handle it here.
2946 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2947 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2948 VT, *Store->getMemOperand()))
2949 return expandUnalignedStore(Store, DAG);
2950
2951 // v2f16, v2bf16 and v2i16 don't need special handling.
2952 if (Isv2x16VT(VT) || VT == MVT::v4i8)
2953 return SDValue();
2954
2955 if (VT.isVector())
2956 return LowerSTOREVector(Op, DAG);
2957
2958 return SDValue();
2959}
2960
2961SDValue
2962NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2963 SDNode *N = Op.getNode();
2964 SDValue Val = N->getOperand(1);
2965 SDLoc DL(N);
2966 EVT ValVT = Val.getValueType();
2967
2968 if (ValVT.isVector()) {
2969 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2970 // legal. We can (and should) split that into 2 stores of <2 x double> here
2971 // but I'm leaving that as a TODO for now.
2972 if (!ValVT.isSimple())
2973 return SDValue();
2974 switch (ValVT.getSimpleVT().SimpleTy) {
2975 default:
2976 return SDValue();
2977 case MVT::v2i8:
2978 case MVT::v2i16:
2979 case MVT::v2i32:
2980 case MVT::v2i64:
2981 case MVT::v2f16:
2982 case MVT::v2bf16:
2983 case MVT::v2f32:
2984 case MVT::v2f64:
2985 case MVT::v4i8:
2986 case MVT::v4i16:
2987 case MVT::v4i32:
2988 case MVT::v4f16:
2989 case MVT::v4bf16:
2990 case MVT::v4f32:
2991 case MVT::v8f16: // <4 x f16x2>
2992 case MVT::v8bf16: // <4 x bf16x2>
2993 case MVT::v8i16: // <4 x i16x2>
2994 // This is a "native" vector type
2995 break;
2996 }
2997
2998 MemSDNode *MemSD = cast<MemSDNode>(N);
2999 const DataLayout &TD = DAG.getDataLayout();
3000
3001 Align Alignment = MemSD->getAlign();
3002 Align PrefAlign =
3003 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3004 if (Alignment < PrefAlign) {
3005 // This store is not sufficiently aligned, so bail out and let this vector
3006 // store be scalarized. Note that we may still be able to emit smaller
3007 // vector stores. For example, if we are storing a <4 x float> with an
3008 // alignment of 8, this check will fail but the legalizer will try again
3009 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3010 return SDValue();
3011 }
3012
3013 unsigned Opcode = 0;
3014 EVT EltVT = ValVT.getVectorElementType();
3015 unsigned NumElts = ValVT.getVectorNumElements();
3016
3017 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3018 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3019 // stored type to i16 and propagate the "real" type as the memory type.
3020 bool NeedExt = false;
3021 if (EltVT.getSizeInBits() < 16)
3022 NeedExt = true;
3023
3024 bool StoreF16x2 = false;
3025 switch (NumElts) {
3026 default:
3027 return SDValue();
3028 case 2:
3029 Opcode = NVPTXISD::StoreV2;
3030 break;
3031 case 4:
3032 Opcode = NVPTXISD::StoreV4;
3033 break;
3034 case 8:
3035 // v8f16 is a special case. PTX doesn't have st.v8.f16
3036 // instruction. Instead, we split the vector into v2f16 chunks and
3037 // store them with st.v4.b32.
3038 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
3039 Opcode = NVPTXISD::StoreV4;
3040 StoreF16x2 = true;
3041 break;
3042 }
3043
3044 SmallVector<SDValue, 8> Ops;
3045
3046 // First is the chain
3047 Ops.push_back(N->getOperand(0));
3048
3049 if (StoreF16x2) {
3050 // Combine f16,f16 -> v2f16
3051 NumElts /= 2;
3052 for (unsigned i = 0; i < NumElts; ++i) {
3053 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3054 DAG.getIntPtrConstant(i * 2, DL));
3055 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3056 DAG.getIntPtrConstant(i * 2 + 1, DL));
3057 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
3058 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
3059 Ops.push_back(V2);
3060 }
3061 } else {
3062 // Then the split values
3063 for (unsigned i = 0; i < NumElts; ++i) {
3064 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3065 DAG.getIntPtrConstant(i, DL));
3066 if (NeedExt)
3067 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3068 Ops.push_back(ExtVal);
3069 }
3070 }
3071
3072 // Then any remaining arguments
3073 Ops.append(N->op_begin() + 2, N->op_end());
3074
3075 SDValue NewSt =
3076 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3077 MemSD->getMemoryVT(), MemSD->getMemOperand());
3078
3079 // return DCI.CombineTo(N, NewSt, true);
3080 return NewSt;
3081 }
3082
3083 return SDValue();
3084}
3085
3086// st i1 v, addr
3087// =>
3088// v1 = zxt v to i16
3089// st.u8 i16, addr
3090SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3091 SDNode *Node = Op.getNode();
3092 SDLoc dl(Node);
3093 StoreSDNode *ST = cast<StoreSDNode>(Node);
3094 SDValue Tmp1 = ST->getChain();
3095 SDValue Tmp2 = ST->getBasePtr();
3096 SDValue Tmp3 = ST->getValue();
3097 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3098 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3099 SDValue Result =
3100 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3101 ST->getAlign(), ST->getMemOperand()->getFlags());
3102 return Result;
3103}
3104
3105SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3106 SelectionDAG &DAG) const {
3107 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3108 // operand so that it can pass the legalization.
3109
3110 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3111 "Custom lowering for 128-bit CopyToReg only");
3112
3113 SDNode *Node = Op.getNode();
3114 SDLoc DL(Node);
3115
3116 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3117 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3118 DAG.getIntPtrConstant(0, DL));
3119 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3120 DAG.getIntPtrConstant(1, DL));
3121
3122 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
3123 SmallVector<EVT, 3> ResultsType(Node->values());
3124
3125 NewOps[0] = Op->getOperand(0); // Chain
3126 NewOps[1] = Op->getOperand(1); // Dst Reg
3127 NewOps[2] = Lo; // Lower 64-bit
3128 NewOps[3] = Hi; // Higher 64-bit
3129 if (Op.getNumOperands() == 4)
3130 NewOps[4] = Op->getOperand(3); // Glue if exists
3131
3132 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3133}
3134
3135unsigned NVPTXTargetLowering::getNumRegisters(
3136 LLVMContext &Context, EVT VT,
3137 std::optional<MVT> RegisterVT = std::nullopt) const {
3138 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3139 return 1;
3140 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3141}
3142
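// Reporting a single register for i128 keeps the value whole, so that
// LowerCopyToReg_128 above can split it into the two i64 halves itself.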
3143bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3144 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3145 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3146 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3147 Parts[0] = Val;
3148 return true;
3149 }
3150 return false;
3151}
3152
3153// This creates target external symbol for a function parameter.
3154// Name of the symbol is composed from its index and the function name.
3155// Negative index corresponds to special parameter (unsized array) used for
3156// passing variable arguments.
3157SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3158 EVT v) const {
3159 StringRef SavedStr = nvTM->getStrPool().save(
3160 getParamName(&DAG.getMachineFunction().getFunction(), idx));
3161 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3162}
3163
3164 SDValue NVPTXTargetLowering::LowerFormalArguments(
3165 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3166 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3167 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3168 MachineFunction &MF = DAG.getMachineFunction();
3169 const DataLayout &DL = DAG.getDataLayout();
3170 auto PtrVT = getPointerTy(DAG.getDataLayout());
3171
3172 const Function *F = &MF.getFunction();
3173 const AttributeList &PAL = F->getAttributes();
3174 const TargetLowering *TLI = STI.getTargetLowering();
3175
3176 SDValue Root = DAG.getRoot();
3177 std::vector<SDValue> OutChains;
3178
3179 bool isABI = (STI.getSmVersion() >= 20);
3180 assert(isABI && "Non-ABI compilation is not supported");
3181 if (!isABI)
3182 return Chain;
3183
3184 std::vector<Type *> argTypes;
3185 std::vector<const Argument *> theArgs;
3186 for (const Argument &I : F->args()) {
3187 theArgs.push_back(&I);
3188 argTypes.push_back(I.getType());
3189 }
3190 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3191 // Ins.size() will be larger
3192 // * if there is an aggregate argument with multiple fields (each field
3193 // showing up separately in Ins)
3194 // * if there is a vector argument with more than typical vector-length
3195 // elements (generally if more than 4) where each vector element is
3196 // individually present in Ins.
3197 // So a different index should be used for indexing into Ins.
3198 // See similar issue in LowerCall.
3199 unsigned InsIdx = 0;
3200
3201 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3202 Type *Ty = argTypes[i];
3203
3204 if (theArgs[i]->use_empty()) {
3205 // argument is dead
3206 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3207 SmallVector<EVT, 16> vtparts;
3208
3209 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3210 if (vtparts.empty())
3211 report_fatal_error("Empty parameter types are not supported");
3212
3213 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3214 ++parti) {
3215 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3216 ++InsIdx;
3217 }
3218 if (vtparts.size() > 0)
3219 --InsIdx;
3220 continue;
3221 }
3222 if (Ty->isVectorTy()) {
3223 EVT ObjectVT = getValueType(DL, Ty);
3224 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3225 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3226 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3227 ++InsIdx;
3228 }
3229 if (NumRegs > 0)
3230 --InsIdx;
3231 continue;
3232 }
3233 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3234 continue;
3235 }
3236
3237 // In the following cases, assign a node order of "i+1"
3238 // to newly created nodes. The SDNodes for params have to
3239 // appear in the same order as their order of appearance
3240 // in the original function. "i+1" holds that order.
3241 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3242 bool aggregateIsPacked = false;
3243 if (StructType *STy = dyn_cast<StructType>(Ty))
3244 aggregateIsPacked = STy->isPacked();
3245
3246 SmallVector<EVT, 16> VTs;
3247 SmallVector<uint64_t, 16> Offsets;
3248 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3249 if (VTs.empty())
3250 report_fatal_error("Empty parameter types are not supported");
3251
3252 Align ArgAlign = getFunctionArgumentAlignment(
3253 F, Ty, i + AttributeList::FirstArgIndex, DL);
3254 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3255
3256 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3257 int VecIdx = -1; // Index of the first element of the current vector.
3258 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3259 if (VectorInfo[parti] & PVF_FIRST) {
3260 assert(VecIdx == -1 && "Orphaned vector.");
3261 VecIdx = parti;
3262 }
3263
3264 // That's the last element of this store op.
3265 if (VectorInfo[parti] & PVF_LAST) {
3266 unsigned NumElts = parti - VecIdx + 1;
3267 EVT EltVT = VTs[parti];
3268 // i1 is loaded/stored as i8.
3269 EVT LoadVT = EltVT;
3270 if (EltVT == MVT::i1)
3271 LoadVT = MVT::i8;
3272 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3273 // getLoad needs a vector type, but it can't handle
3274 // vectors which contain v2f16 or v2bf16 elements. So we must load
3275 // using i32 here and then bitcast back.
3276 LoadVT = MVT::i32;
3277
3278 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3279 SDValue VecAddr =
3280 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3281 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3282 Value *srcValue = Constant::getNullValue(PointerType::get(
3283 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3284
3285 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3286 if (aggregateIsPacked)
3287 return Align(1);
3288 if (NumElts != 1)
3289 return std::nullopt;
3290 Align PartAlign =
3291 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3292 return commonAlignment(PartAlign, Offsets[parti]);
3293 }();
3294 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3295 MachinePointerInfo(srcValue), PartAlign,
3296 MachineMemOperand::MODereferenceable |
3297 MachineMemOperand::MOInvariant);
3298 if (P.getNode())
3299 P.getNode()->setIROrder(i + 1);
3300 for (unsigned j = 0; j < NumElts; ++j) {
3301 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3302 DAG.getIntPtrConstant(j, dl));
3303 // We've loaded i1 as an i8 and now must truncate it back to i1
3304 if (EltVT == MVT::i1)
3305 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3306 // v2f16 was loaded as an i32. Now we must bitcast it back.
3307 else if (EltVT != LoadVT)
3308 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3309
3310 // If a promoted integer type is used, truncate down to the original
3311 MVT PromotedVT;
3312 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3313 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3314 }
3315
3316 // Extend the element if necessary (e.g. an i8 is loaded
3317 // into an i16 register)
3318 if (Ins[InsIdx].VT.isInteger() &&
3319 Ins[InsIdx].VT.getFixedSizeInBits() >
3320 LoadVT.getFixedSizeInBits()) {
3321 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3322 : ISD::ZERO_EXTEND;
3323 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3324 }
3325 InVals.push_back(Elt);
3326 }
3327
3328 // Reset vector tracking state.
3329 VecIdx = -1;
3330 }
3331 ++InsIdx;
3332 }
3333 if (VTs.size() > 0)
3334 --InsIdx;
3335 continue;
3336 }
3337
3338 // Param has ByVal attribute
3339 // Return MoveParam(param symbol).
3340 // Ideally, the param symbol can be returned directly,
3341 // but when SDNode builder decides to use it in a CopyToReg(),
3342 // machine instruction fails because TargetExternalSymbol
3343 // (not lowered) is target dependent, and CopyToReg assumes
3344 // the source is lowered.
3345 EVT ObjectVT = getValueType(DL, Ty);
3346 assert(ObjectVT == Ins[InsIdx].VT &&
3347 "Ins type did not match function type");
3348 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3349 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3350 if (p.getNode())
3351 p.getNode()->setIROrder(i + 1);
3352 InVals.push_back(p);
3353 }
3354
3355 if (!OutChains.empty())
3356 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3357
3358 return Chain;
3359}
3360
3361 // Use byte-store when the param address of the return value is unaligned.
3362// This may happen when the return value is a field of a packed structure.
3363 static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
3364 uint64_t Offset, EVT ElementType,
3365 SDValue RetVal, const SDLoc &dl) {
3366 // Bit logic only works on integer types
3367 if (adjustElementType(ElementType))
3368 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3369
3370 // Store each byte
3371 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3372 // Shift the byte to the last byte position
3373 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3374 DAG.getConstant(i * 8, dl, MVT::i32));
3375 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3376 ShiftVal};
3377 // Trunc store only the last byte by using
3378 // st.param.b8
3379 // The register type can be larger than b8.
3380 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
3381 DAG.getVTList(MVT::Other), StoreOperands,
3382 MVT::i8, MachinePointerInfo(), std::nullopt,
3383 MachineMemOperand::MOStore);
3384 }
3385 return Chain;
3386}
3387
3388SDValue
3389 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3390 bool isVarArg,
3391 const SmallVectorImpl<ISD::OutputArg> &Outs,
3392 const SmallVectorImpl<SDValue> &OutVals,
3393 const SDLoc &dl, SelectionDAG &DAG) const {
3394 const MachineFunction &MF = DAG.getMachineFunction();
3395 const Function &F = MF.getFunction();
3396 Type *RetTy = MF.getFunction().getReturnType();
3397
3398 bool isABI = (STI.getSmVersion() >= 20);
3399 assert(isABI && "Non-ABI compilation is not supported");
3400 if (!isABI)
3401 return Chain;
3402
3403 const DataLayout &DL = DAG.getDataLayout();
3404 SmallVector<SDValue, 16> PromotedOutVals;
3405 SmallVector<EVT, 16> VTs;
3406 SmallVector<uint64_t, 16> Offsets;
3407 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3408 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3409
3410 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3411 SDValue PromotedOutVal = OutVals[i];
3412 MVT PromotedVT;
3413 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3414 VTs[i] = EVT(PromotedVT);
3415 }
3416 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3417 llvm::ISD::NodeType Ext =
3418 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3419 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3420 }
3421 PromotedOutVals.push_back(PromotedOutVal);
3422 }
3423
3424 auto VectorInfo = VectorizePTXValueVTs(
3425 VTs, Offsets,
3426 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3427 : Align(1));
3428
3429 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3430 // 32-bits are sign extended or zero extended, depending on whether
3431 // they are signed or unsigned types.
3432 bool ExtendIntegerRetVal =
3433 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3434
3435 SmallVector<SDValue, 6> StoreOperands;
3436 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3437 SDValue OutVal = OutVals[i];
3438 SDValue RetVal = PromotedOutVals[i];
3439
3440 if (ExtendIntegerRetVal) {
3441 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3442 : ISD::ZERO_EXTEND,
3443 dl, MVT::i32, RetVal);
3444 } else if (OutVal.getValueSizeInBits() < 16) {
3445 // Use 16-bit registers for small load-stores as it's the
3446 // smallest general purpose register size supported by NVPTX.
3447 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3448 }
3449
3450 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3451 // for a scalar store. In such cases, fall back to byte stores.
3452 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3453 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3454 Align ElementTypeAlign =
3455 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3456 Align ElementAlign =
3457 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3458 if (ElementAlign < ElementTypeAlign) {
3459 assert(StoreOperands.empty() && "Orphaned operand list.");
3460 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3461 RetVal, dl);
3462
3463 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3464 // into the graph, so just move on to the next element.
3465 continue;
3466 }
3467 }
3468
3469 // New load/store. Record chain and offset operands.
3470 if (VectorInfo[i] & PVF_FIRST) {
3471 assert(StoreOperands.empty() && "Orphaned operand list.");
3472 StoreOperands.push_back(Chain);
3473 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3474 }
3475
3476 // Record the value to return.
3477 StoreOperands.push_back(RetVal);
3478
3479 // That's the last element of this store op.
3480 if (VectorInfo[i] & PVF_LAST) {
3481 NVPTXISD::NodeType Op;
3482 unsigned NumElts = StoreOperands.size() - 2;
3483 switch (NumElts) {
3484 case 1:
3485 Op = NVPTXISD::StoreRetval;
3486 break;
3487 case 2:
3488 Op = NVPTXISD::StoreRetvalV2;
3489 break;
3490 case 4:
3491 Op = NVPTXISD::StoreRetvalV4;
3492 break;
3493 default:
3494 llvm_unreachable("Invalid vector info.");
3495 }
3496
3497 // Adjust type of load/store op if we've extended the scalar
3498 // return value.
3499 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3500 Chain = DAG.getMemIntrinsicNode(
3501 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3502 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3503 // Cleanup vector state.
3504 StoreOperands.clear();
3505 }
3506 }
3507
3508 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3509}
3510
3511 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3512 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3513 SelectionDAG &DAG) const {
3514 if (Constraint.size() > 1)
3515 return;
3516 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3517}
3518
3519static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3520 switch (Intrinsic) {
3521 default:
3522 return 0;
3523
3524 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3525 return NVPTXISD::Tex1DFloatS32;
3526 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3527 return NVPTXISD::Tex1DFloatFloat;
3528 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3529 return NVPTXISD::Tex1DFloatFloatLevel;
3530 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3531 return NVPTXISD::Tex1DFloatFloatGrad;
3532 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3533 return NVPTXISD::Tex1DS32S32;
3534 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3535 return NVPTXISD::Tex1DS32Float;
3536 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3537 return NVPTXISD::Tex1DS32FloatLevel;
3538 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3539 return NVPTXISD::Tex1DS32FloatGrad;
3540 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3541 return NVPTXISD::Tex1DU32S32;
3542 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3543 return NVPTXISD::Tex1DU32Float;
3544 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3545 return NVPTXISD::Tex1DU32FloatLevel;
3546 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3547 return NVPTXISD::Tex1DU32FloatGrad;
3548
3549 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3550 return NVPTXISD::Tex1DArrayFloatS32;
3551 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3552 return NVPTXISD::Tex1DArrayFloatFloat;
3553 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3554 return NVPTXISD::Tex1DArrayFloatFloatLevel;
3555 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3556 return NVPTXISD::Tex1DArrayFloatFloatGrad;
3557 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3558 return NVPTXISD::Tex1DArrayS32S32;
3559 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3560 return NVPTXISD::Tex1DArrayS32Float;
3561 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3562 return NVPTXISD::Tex1DArrayS32FloatLevel;
3563 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3564 return NVPTXISD::Tex1DArrayS32FloatGrad;
3565 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3566 return NVPTXISD::Tex1DArrayU32S32;
3567 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3568 return NVPTXISD::Tex1DArrayU32Float;
3569 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3570 return NVPTXISD::Tex1DArrayU32FloatLevel;
3571 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3572 return NVPTXISD::Tex1DArrayU32FloatGrad;
3573
3574 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3575 return NVPTXISD::Tex2DFloatS32;
3576 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3577 return NVPTXISD::Tex2DFloatFloat;
3578 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3579 return NVPTXISD::Tex2DFloatFloatLevel;
3580 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3581 return NVPTXISD::Tex2DFloatFloatGrad;
3582 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3583 return NVPTXISD::Tex2DS32S32;
3584 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3585 return NVPTXISD::Tex2DS32Float;
3586 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3587 return NVPTXISD::Tex2DS32FloatLevel;
3588 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3589 return NVPTXISD::Tex2DS32FloatGrad;
3590 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3591 return NVPTXISD::Tex2DU32S32;
3592 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3593 return NVPTXISD::Tex2DU32Float;
3594 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3595 return NVPTXISD::Tex2DU32FloatLevel;
3596 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3597 return NVPTXISD::Tex2DU32FloatGrad;
3598
3599 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3600 return NVPTXISD::Tex2DArrayFloatS32;
3601 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3602 return NVPTXISD::Tex2DArrayFloatFloat;
3603 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3604 return NVPTXISD::Tex2DArrayFloatFloatLevel;
3605 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3606 return NVPTXISD::Tex2DArrayFloatFloatGrad;
3607 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3608 return NVPTXISD::Tex2DArrayS32S32;
3609 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3610 return NVPTXISD::Tex2DArrayS32Float;
3611 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3612 return NVPTXISD::Tex2DArrayS32FloatLevel;
3613 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3614 return NVPTXISD::Tex2DArrayS32FloatGrad;
3615 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3616 return NVPTXISD::Tex2DArrayU32S32;
3617 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3618 return NVPTXISD::Tex2DArrayU32Float;
3619 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3620 return NVPTXISD::Tex2DArrayU32FloatLevel;
3621 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3622 return NVPTXISD::Tex2DArrayU32FloatGrad;
3623
3624 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3625 return NVPTXISD::Tex3DFloatS32;
3626 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3627 return NVPTXISD::Tex3DFloatFloat;
3628 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3629 return NVPTXISD::Tex3DFloatFloatLevel;
3630 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3631 return NVPTXISD::Tex3DFloatFloatGrad;
3632 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3633 return NVPTXISD::Tex3DS32S32;
3634 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3635 return NVPTXISD::Tex3DS32Float;
3636 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3637 return NVPTXISD::Tex3DS32FloatLevel;
3638 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3639 return NVPTXISD::Tex3DS32FloatGrad;
3640 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3641 return NVPTXISD::Tex3DU32S32;
3642 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3643 return NVPTXISD::Tex3DU32Float;
3644 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3645 return NVPTXISD::Tex3DU32FloatLevel;
3646 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3647 return NVPTXISD::Tex3DU32FloatGrad;
3648
3649 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3650 return NVPTXISD::TexCubeFloatFloat;
3651 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3652 return NVPTXISD::TexCubeFloatFloatLevel;
3653 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3654 return NVPTXISD::TexCubeS32Float;
3655 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3656 return NVPTXISD::TexCubeS32FloatLevel;
3657 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3658 return NVPTXISD::TexCubeU32Float;
3659 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3660 return NVPTXISD::TexCubeU32FloatLevel;
3661
3662 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3663 return NVPTXISD::TexCubeArrayFloatFloat;
3664 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3665 return NVPTXISD::TexCubeArrayFloatFloatLevel;
3666 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3667 return NVPTXISD::TexCubeArrayS32Float;
3668 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3669 return NVPTXISD::TexCubeArrayS32FloatLevel;
3670 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3671 return NVPTXISD::TexCubeArrayU32Float;
3672 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3673 return NVPTXISD::TexCubeArrayU32FloatLevel;
3674
3675 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3676 return NVPTXISD::Tld4R2DFloatFloat;
3677 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3678 return NVPTXISD::Tld4G2DFloatFloat;
3679 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3680 return NVPTXISD::Tld4B2DFloatFloat;
3681 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3682 return NVPTXISD::Tld4A2DFloatFloat;
3683 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3684 return NVPTXISD::Tld4R2DS64Float;
3685 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3686 return NVPTXISD::Tld4G2DS64Float;
3687 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3688 return NVPTXISD::Tld4B2DS64Float;
3689 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3690 return NVPTXISD::Tld4A2DS64Float;
3691 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3692 return NVPTXISD::Tld4R2DU64Float;
3693 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3694 return NVPTXISD::Tld4G2DU64Float;
3695 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3696 return NVPTXISD::Tld4B2DU64Float;
3697 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3698 return NVPTXISD::Tld4A2DU64Float;
3699
3700 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3701 return NVPTXISD::TexUnified1DFloatS32;
3702 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3703 return NVPTXISD::TexUnified1DFloatFloat;
3704 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3705 return NVPTXISD::TexUnified1DFloatFloatLevel;
3706 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3707 return NVPTXISD::TexUnified1DFloatFloatGrad;
3708 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3709 return NVPTXISD::TexUnified1DS32S32;
3710 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3711 return NVPTXISD::TexUnified1DS32Float;
3712 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3713 return NVPTXISD::TexUnified1DS32FloatLevel;
3714 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3715 return NVPTXISD::TexUnified1DS32FloatGrad;
3716 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3717 return NVPTXISD::TexUnified1DU32S32;
3718 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3719 return NVPTXISD::TexUnified1DU32Float;
3720 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3721 return NVPTXISD::TexUnified1DU32FloatLevel;
3722 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3723 return NVPTXISD::TexUnified1DU32FloatGrad;
3724
3725 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3726 return NVPTXISD::TexUnified1DArrayFloatS32;
3727 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3728 return NVPTXISD::TexUnified1DArrayFloatFloat;
3729 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3730 return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3731 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3732 return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3733 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3734 return NVPTXISD::TexUnified1DArrayS32S32;
3735 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3736 return NVPTXISD::TexUnified1DArrayS32Float;
3737 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3738 return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3739 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3740 return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3741 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3742 return NVPTXISD::TexUnified1DArrayU32S32;
3743 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3744 return NVPTXISD::TexUnified1DArrayU32Float;
3745 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3746 return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3747 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3748 return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3749
3750 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3751 return NVPTXISD::TexUnified2DFloatS32;
3752 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3753 return NVPTXISD::TexUnified2DFloatFloat;
3754 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3755 return NVPTXISD::TexUnified2DFloatFloatLevel;
3756 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3757 return NVPTXISD::TexUnified2DFloatFloatGrad;
3758 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3759 return NVPTXISD::TexUnified2DS32S32;
3760 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3761 return NVPTXISD::TexUnified2DS32Float;
3762 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3763 return NVPTXISD::TexUnified2DS32FloatLevel;
3764 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3765 return NVPTXISD::TexUnified2DS32FloatGrad;
3766 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3767 return NVPTXISD::TexUnified2DU32S32;
3768 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3769 return NVPTXISD::TexUnified2DU32Float;
3770 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3771 return NVPTXISD::TexUnified2DU32FloatLevel;
3772 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3773 return NVPTXISD::TexUnified2DU32FloatGrad;
3774
3775 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3776 return NVPTXISD::TexUnified2DArrayFloatS32;
3777 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3778 return NVPTXISD::TexUnified2DArrayFloatFloat;
3779 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3780 return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3781 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3782 return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3783 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3784 return NVPTXISD::TexUnified2DArrayS32S32;
3785 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3786 return NVPTXISD::TexUnified2DArrayS32Float;
3787 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3788 return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3789 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3790 return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3791 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3792 return NVPTXISD::TexUnified2DArrayU32S32;
3793 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3794 return NVPTXISD::TexUnified2DArrayU32Float;
3795 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3796 return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3797 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3798 return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3799
3800 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3801 return NVPTXISD::TexUnified3DFloatS32;
3802 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3803 return NVPTXISD::TexUnified3DFloatFloat;
3804 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3805 return NVPTXISD::TexUnified3DFloatFloatLevel;
3806 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3807 return NVPTXISD::TexUnified3DFloatFloatGrad;
3808 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3809 return NVPTXISD::TexUnified3DS32S32;
3810 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3811 return NVPTXISD::TexUnified3DS32Float;
3812 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3813 return NVPTXISD::TexUnified3DS32FloatLevel;
3814 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3815 return NVPTXISD::TexUnified3DS32FloatGrad;
3816 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3817 return NVPTXISD::TexUnified3DU32S32;
3818 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3819 return NVPTXISD::TexUnified3DU32Float;
3820 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3821 return NVPTXISD::TexUnified3DU32FloatLevel;
3822 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3823 return NVPTXISD::TexUnified3DU32FloatGrad;
3824
3825 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3826 return NVPTXISD::TexUnifiedCubeFloatFloat;
3827 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3828 return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3829 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3830 return NVPTXISD::TexUnifiedCubeS32Float;
3831 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3832 return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3833 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3834 return NVPTXISD::TexUnifiedCubeU32Float;
3835 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3836 return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3837
3838 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3839 return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3840 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3841 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3842 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3843 return NVPTXISD::TexUnifiedCubeArrayS32Float;
3844 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3845 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3846 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3847 return NVPTXISD::TexUnifiedCubeArrayU32Float;
3848 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3849 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3850
3851 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3852 return NVPTXISD::TexUnifiedCubeFloatFloatGrad;
3853 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3854 return NVPTXISD::TexUnifiedCubeS32FloatGrad;
3855 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3856 return NVPTXISD::TexUnifiedCubeU32FloatGrad;
3857 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3858 return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;
3859 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3860 return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;
3861 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3862 return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;
3863
3864 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3865 return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3866 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3867 return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3868 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3869 return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3870 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3871 return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3872 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3873 return NVPTXISD::Tld4UnifiedR2DS64Float;
3874 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3875 return NVPTXISD::Tld4UnifiedG2DS64Float;
3876 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3877 return NVPTXISD::Tld4UnifiedB2DS64Float;
3878 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3879 return NVPTXISD::Tld4UnifiedA2DS64Float;
3880 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3881 return NVPTXISD::Tld4UnifiedR2DU64Float;
3882 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3883 return NVPTXISD::Tld4UnifiedG2DU64Float;
3884 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3885 return NVPTXISD::Tld4UnifiedB2DU64Float;
3886 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3887 return NVPTXISD::Tld4UnifiedA2DU64Float;
3888 }
3889}
3890
3891static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3892 switch (Intrinsic) {
3893 default:
3894 return 0;
3895 case Intrinsic::nvvm_suld_1d_i8_clamp:
3896 return NVPTXISD::Suld1DI8Clamp;
3897 case Intrinsic::nvvm_suld_1d_i16_clamp:
3898 return NVPTXISD::Suld1DI16Clamp;
3899 case Intrinsic::nvvm_suld_1d_i32_clamp:
3900 return NVPTXISD::Suld1DI32Clamp;
3901 case Intrinsic::nvvm_suld_1d_i64_clamp:
3902 return NVPTXISD::Suld1DI64Clamp;
3903 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3904 return NVPTXISD::Suld1DV2I8Clamp;
3905 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3906 return NVPTXISD::Suld1DV2I16Clamp;
3907 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3908 return NVPTXISD::Suld1DV2I32Clamp;
3909 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3910 return NVPTXISD::Suld1DV2I64Clamp;
3911 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3912 return NVPTXISD::Suld1DV4I8Clamp;
3913 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3914 return NVPTXISD::Suld1DV4I16Clamp;
3915 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3916 return NVPTXISD::Suld1DV4I32Clamp;
3917 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3918 return NVPTXISD::Suld1DArrayI8Clamp;
3919 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3920 return NVPTXISD::Suld1DArrayI16Clamp;
3921 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3922 return NVPTXISD::Suld1DArrayI32Clamp;
3923 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3924 return NVPTXISD::Suld1DArrayI64Clamp;
3925 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3926 return NVPTXISD::Suld1DArrayV2I8Clamp;
3927 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3928 return NVPTXISD::Suld1DArrayV2I16Clamp;
3929 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3930 return NVPTXISD::Suld1DArrayV2I32Clamp;
3931 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3932 return NVPTXISD::Suld1DArrayV2I64Clamp;
3933 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3934 return NVPTXISD::Suld1DArrayV4I8Clamp;
3935 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3936 return NVPTXISD::Suld1DArrayV4I16Clamp;
3937 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3938 return NVPTXISD::Suld1DArrayV4I32Clamp;
3939 case Intrinsic::nvvm_suld_2d_i8_clamp:
3940 return NVPTXISD::Suld2DI8Clamp;
3941 case Intrinsic::nvvm_suld_2d_i16_clamp:
3942 return NVPTXISD::Suld2DI16Clamp;
3943 case Intrinsic::nvvm_suld_2d_i32_clamp:
3944 return NVPTXISD::Suld2DI32Clamp;
3945 case Intrinsic::nvvm_suld_2d_i64_clamp:
3946 return NVPTXISD::Suld2DI64Clamp;
3947 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3948 return NVPTXISD::Suld2DV2I8Clamp;
3949 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3950 return NVPTXISD::Suld2DV2I16Clamp;
3951 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3952 return NVPTXISD::Suld2DV2I32Clamp;
3953 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3954 return NVPTXISD::Suld2DV2I64Clamp;
3955 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3956 return NVPTXISD::Suld2DV4I8Clamp;
3957 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3958 return NVPTXISD::Suld2DV4I16Clamp;
3959 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3960 return NVPTXISD::Suld2DV4I32Clamp;
3961 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3962 return NVPTXISD::Suld2DArrayI8Clamp;
3963 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3964 return NVPTXISD::Suld2DArrayI16Clamp;
3965 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3966 return NVPTXISD::Suld2DArrayI32Clamp;
3967 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3968 return NVPTXISD::Suld2DArrayI64Clamp;
3969 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3970 return NVPTXISD::Suld2DArrayV2I8Clamp;
3971 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3972 return NVPTXISD::Suld2DArrayV2I16Clamp;
3973 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3974 return NVPTXISD::Suld2DArrayV2I32Clamp;
3975 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3976 return NVPTXISD::Suld2DArrayV2I64Clamp;
3977 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3978 return NVPTXISD::Suld2DArrayV4I8Clamp;
3979 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3980 return NVPTXISD::Suld2DArrayV4I16Clamp;
3981 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3982 return NVPTXISD::Suld2DArrayV4I32Clamp;
3983 case Intrinsic::nvvm_suld_3d_i8_clamp:
3984 return NVPTXISD::Suld3DI8Clamp;
3985 case Intrinsic::nvvm_suld_3d_i16_clamp:
3986 return NVPTXISD::Suld3DI16Clamp;
3987 case Intrinsic::nvvm_suld_3d_i32_clamp:
3988 return NVPTXISD::Suld3DI32Clamp;
3989 case Intrinsic::nvvm_suld_3d_i64_clamp:
3990 return NVPTXISD::Suld3DI64Clamp;
3991 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3992 return NVPTXISD::Suld3DV2I8Clamp;
3993 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3994 return NVPTXISD::Suld3DV2I16Clamp;
3995 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3996 return NVPTXISD::Suld3DV2I32Clamp;
3997 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3998 return NVPTXISD::Suld3DV2I64Clamp;
3999 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4000 return NVPTXISD::Suld3DV4I8Clamp;
4001 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4002 return NVPTXISD::Suld3DV4I16Clamp;
4003 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4004 return NVPTXISD::Suld3DV4I32Clamp;
4005 case Intrinsic::nvvm_suld_1d_i8_trap:
4006 return NVPTXISD::Suld1DI8Trap;
4007 case Intrinsic::nvvm_suld_1d_i16_trap:
4008 return NVPTXISD::Suld1DI16Trap;
4009 case Intrinsic::nvvm_suld_1d_i32_trap:
4010 return NVPTXISD::Suld1DI32Trap;
4011 case Intrinsic::nvvm_suld_1d_i64_trap:
4012 return NVPTXISD::Suld1DI64Trap;
4013 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4014 return NVPTXISD::Suld1DV2I8Trap;
4015 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4016 return NVPTXISD::Suld1DV2I16Trap;
4017 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4018 return NVPTXISD::Suld1DV2I32Trap;
4019 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4020 return NVPTXISD::Suld1DV2I64Trap;
4021 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4022 return NVPTXISD::Suld1DV4I8Trap;
4023 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4024 return NVPTXISD::Suld1DV4I16Trap;
4025 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4026 return NVPTXISD::Suld1DV4I32Trap;
4027 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4028 return NVPTXISD::Suld1DArrayI8Trap;
4029 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4030 return NVPTXISD::Suld1DArrayI16Trap;
4031 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4032 return NVPTXISD::Suld1DArrayI32Trap;
4033 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4034 return NVPTXISD::Suld1DArrayI64Trap;
4035 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4036 return NVPTXISD::Suld1DArrayV2I8Trap;
4037 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4038 return NVPTXISD::Suld1DArrayV2I16Trap;
4039 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4040 return NVPTXISD::Suld1DArrayV2I32Trap;
4041 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4042 return NVPTXISD::Suld1DArrayV2I64Trap;
4043 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4044 return NVPTXISD::Suld1DArrayV4I8Trap;
4045 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4046 return NVPTXISD::Suld1DArrayV4I16Trap;
4047 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4048 return NVPTXISD::Suld1DArrayV4I32Trap;
4049 case Intrinsic::nvvm_suld_2d_i8_trap:
4050 return NVPTXISD::Suld2DI8Trap;
4051 case Intrinsic::nvvm_suld_2d_i16_trap:
4052 return NVPTXISD::Suld2DI16Trap;
4053 case Intrinsic::nvvm_suld_2d_i32_trap:
4054 return NVPTXISD::Suld2DI32Trap;
4055 case Intrinsic::nvvm_suld_2d_i64_trap:
4056 return NVPTXISD::Suld2DI64Trap;
4057 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4058 return NVPTXISD::Suld2DV2I8Trap;
4059 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4060 return NVPTXISD::Suld2DV2I16Trap;
4061 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4062 return NVPTXISD::Suld2DV2I32Trap;
4063 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4064 return NVPTXISD::Suld2DV2I64Trap;
4065 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4066 return NVPTXISD::Suld2DV4I8Trap;
4067 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4068 return NVPTXISD::Suld2DV4I16Trap;
4069 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4070 return NVPTXISD::Suld2DV4I32Trap;
4071 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4072 return NVPTXISD::Suld2DArrayI8Trap;
4073 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4074 return NVPTXISD::Suld2DArrayI16Trap;
4075 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4076 return NVPTXISD::Suld2DArrayI32Trap;
4077 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4079 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4081 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4083 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4085 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4087 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4089 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4091 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4093 case Intrinsic::nvvm_suld_3d_i8_trap:
4095 case Intrinsic::nvvm_suld_3d_i16_trap:
4097 case Intrinsic::nvvm_suld_3d_i32_trap:
4099 case Intrinsic::nvvm_suld_3d_i64_trap:
4101 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4103 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4105 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4107 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4109 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4111 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4113 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4115 case Intrinsic::nvvm_suld_1d_i8_zero:
4117 case Intrinsic::nvvm_suld_1d_i16_zero:
4119 case Intrinsic::nvvm_suld_1d_i32_zero:
4121 case Intrinsic::nvvm_suld_1d_i64_zero:
4123 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4125 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4127 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4129 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4131 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4133 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4135 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4137 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4139 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4141 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4143 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4145 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4147 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4149 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4151 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4153 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4155 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4157 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4159 case Intrinsic::nvvm_suld_2d_i8_zero:
4161 case Intrinsic::nvvm_suld_2d_i16_zero:
4163 case Intrinsic::nvvm_suld_2d_i32_zero:
4165 case Intrinsic::nvvm_suld_2d_i64_zero:
4167 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4169 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4171 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4173 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4175 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4177 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4179 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4181 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4183 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4185 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4187 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4189 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4191 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4193 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4195 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4197 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4199 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4201 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4203 case Intrinsic::nvvm_suld_3d_i8_zero:
4205 case Intrinsic::nvvm_suld_3d_i16_zero:
4207 case Intrinsic::nvvm_suld_3d_i32_zero:
4209 case Intrinsic::nvvm_suld_3d_i64_zero:
4211 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4213 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4215 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4217 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4219 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4221 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4223 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4225 }
4226}
4227
4228// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4229// TgtMemIntrinsic because we need information that is only available in
4230// the "Value" type of the destination pointer. In particular, we need
4231// the address space information.
4232//
4233bool NVPTXTargetLowering::getTgtMemIntrinsic(
4234 IntrinsicInfo &Info, const CallInst &I,
4235 MachineFunction &MF, unsigned Intrinsic) const {
4236 switch (Intrinsic) {
4237 default:
4238 return false;
4239 case Intrinsic::nvvm_match_all_sync_i32p:
4240 case Intrinsic::nvvm_match_all_sync_i64p:
4241 Info.opc = ISD::INTRINSIC_W_CHAIN;
4242 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4243 // in order to model data exchange with other threads, but perform no real
4244 // memory accesses.
4245 Info.memVT = MVT::i1;
4246
4247 // Our result depends on both our own and other threads' arguments.
4248 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4249 return true;
4250 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4251 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4252 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4253 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4254 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4255 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4256 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4257 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4258 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4259 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4260 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4261 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4262 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4263 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4264 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4265 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4266 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4267 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4268 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4269 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4270 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4271 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4272 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4273 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4274 Info.opc = ISD::INTRINSIC_W_CHAIN;
4275 Info.memVT = MVT::v8f16;
4276 Info.ptrVal = I.getArgOperand(0);
4277 Info.offset = 0;
4278 Info.flags = MachineMemOperand::MOLoad;
4279 Info.align = Align(16);
4280 return true;
4281 }
4282 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4283 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4284 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4285 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4286 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4287 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4288 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4289 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4290 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4291 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4292 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4293 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4294 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4295 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4296 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4297 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4298 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4299 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4300 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4301 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4302 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4303 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4304 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4305 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4306 Info.opc = ISD::INTRINSIC_W_CHAIN;
4307 Info.memVT = MVT::v2i32;
4308 Info.ptrVal = I.getArgOperand(0);
4309 Info.offset = 0;
4310 Info.flags = MachineMemOperand::MOLoad;
4311 Info.align = Align(8);
4312 return true;
4313 }
4314
4315 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4316 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4317 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4318 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4319 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4320 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4321 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4322 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4323 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4324 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4325 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4326 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4327 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4328 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4329 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4330 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4331
4332 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4333 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4334 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4335 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4336 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4337 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4338 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4339 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4340 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4341 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4342 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4343 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4344 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4345 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4346 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4347 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4348 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4349 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4350 Info.opc = ISD::INTRINSIC_W_CHAIN;
4351 Info.memVT = MVT::v4i32;
4352 Info.ptrVal = I.getArgOperand(0);
4353 Info.offset = 0;
4354 Info.flags = MachineMemOperand::MOLoad;
4355 Info.align = Align(16);
4356 return true;
4357 }
4358
4359 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4360 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4361 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4362 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4363 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4364 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4365 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4366 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4367
4368 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4369 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4370 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4371 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4372 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4373 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4374 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4375 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4376 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4377 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4378 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4379 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4380 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4381 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4382 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4383 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4384 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4385 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4386 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4387 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4388 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4389 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4390 Info.opc = ISD::INTRINSIC_W_CHAIN;
4391 Info.memVT = MVT::i32;
4392 Info.ptrVal = I.getArgOperand(0);
4393 Info.offset = 0;
4394 Info.flags = MachineMemOperand::MOLoad;
4395 Info.align = Align(4);
4396 return true;
4397 }
4398
4399 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4400 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4401 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4402 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4403 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4404 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4405 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4406 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4407 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4408 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4409 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4410 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4411 Info.opc = ISD::INTRINSIC_W_CHAIN;
4412 Info.memVT = MVT::v4f16;
4413 Info.ptrVal = I.getArgOperand(0);
4414 Info.offset = 0;
4415 Info.flags = MachineMemOperand::MOLoad;
4416 Info.align = Align(16);
4417 return true;
4418 }
4419
4420 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4421 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4422 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4423 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4424 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4425 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4426 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4427 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4428 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4429 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4430 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4431 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4432 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4433 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4434 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4435 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4436 Info.opc = ISD::INTRINSIC_W_CHAIN;
4437 Info.memVT = MVT::v8f32;
4438 Info.ptrVal = I.getArgOperand(0);
4439 Info.offset = 0;
4440 Info.flags = MachineMemOperand::MOLoad;
4441 Info.align = Align(16);
4442 return true;
4443 }
4444
4445 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4446 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4447 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4448 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4449
4450 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4451 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4452 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4453 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4454
4455 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4456 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4457 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4458 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4459 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4460 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4461 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4462 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4463 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4464 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4465 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4466 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4467 Info.opc = ISD::INTRINSIC_W_CHAIN;
4468 Info.memVT = MVT::v8i32;
4469 Info.ptrVal = I.getArgOperand(0);
4470 Info.offset = 0;
4471 Info.flags = MachineMemOperand::MOLoad;
4472 Info.align = Align(16);
4473 return true;
4474 }
4475
4476 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4477 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4478 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4479 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4480 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4481 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4482 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4483 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4484 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4485 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4486 Info.opc = ISD::INTRINSIC_W_CHAIN;
4487 Info.memVT = MVT::v2i32;
4488 Info.ptrVal = I.getArgOperand(0);
4489 Info.offset = 0;
4490 Info.flags = MachineMemOperand::MOLoad;
4491 Info.align = Align(8);
4492 return true;
4493 }
4494
4495 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4496 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4497 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4498 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4499
4500 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4501 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4502 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4503 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4504 Info.opc = ISD::INTRINSIC_W_CHAIN;
4505 Info.memVT = MVT::f64;
4506 Info.ptrVal = I.getArgOperand(0);
4507 Info.offset = 0;
4508 Info.flags = MachineMemOperand::MOLoad;
4509 Info.align = Align(8);
4510 return true;
4511 }
4512
4513 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4514 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4515 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4516 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4517 Info.opc = ISD::INTRINSIC_W_CHAIN;
4518 Info.memVT = MVT::v2f64;
4519 Info.ptrVal = I.getArgOperand(0);
4520 Info.offset = 0;
4521 Info.flags = MachineMemOperand::MOLoad;
4522 Info.align = Align(16);
4523 return true;
4524 }
4525
4526 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4527 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4528 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4529 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4530 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4531 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4532 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4533 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4534 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4535 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4536 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4537 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4538 Info.opc = ISD::INTRINSIC_VOID;
4539 Info.memVT = MVT::v4f16;
4540 Info.ptrVal = I.getArgOperand(0);
4541 Info.offset = 0;
4542 Info.flags = MachineMemOperand::MOStore;
4543 Info.align = Align(16);
4544 return true;
4545 }
4546
4547 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4548 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4549 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4550 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4551 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4552 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4553 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4554 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4555 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4556 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4557 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4558 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4559 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4560 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4561 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4562 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4563 Info.opc = ISD::INTRINSIC_VOID;
4564 Info.memVT = MVT::v8f32;
4565 Info.ptrVal = I.getArgOperand(0);
4566 Info.offset = 0;
4567 Info.flags = MachineMemOperand::MOStore;
4568 Info.align = Align(16);
4569 return true;
4570 }
4571
4572 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4573 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4574 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4575 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4576 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4577 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4578 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4579 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4580 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4581 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4582 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4583 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4584 Info.opc = ISD::INTRINSIC_VOID;
4585 Info.memVT = MVT::v8i32;
4586 Info.ptrVal = I.getArgOperand(0);
4587 Info.offset = 0;
4588 Info.flags = MachineMemOperand::MOStore;
4589 Info.align = Align(16);
4590 return true;
4591 }
4592
4593 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4594 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4595 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4596 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4597 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4598 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4599 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4600 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4601 Info.opc = ISD::INTRINSIC_VOID;
4602 Info.memVT = MVT::v2i32;
4603 Info.ptrVal = I.getArgOperand(0);
4604 Info.offset = 0;
4605 Info.flags = MachineMemOperand::MOStore;
4606 Info.align = Align(8);
4607 return true;
4608 }
4609
4610 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4611 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4612 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4613 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4614 Info.opc = ISD::INTRINSIC_VOID;
4615 Info.memVT = MVT::v2f64;
4616 Info.ptrVal = I.getArgOperand(0);
4617 Info.offset = 0;
4618 Info.flags = MachineMemOperand::MOStore;
4619 Info.align = Align(16);
4620 return true;
4621 }
4622
4623 case Intrinsic::nvvm_atomic_load_inc_32:
4624 case Intrinsic::nvvm_atomic_load_dec_32:
4625
4626 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4627 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4628 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4629 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4630 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4631 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4632 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4633 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4634 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4635 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4636 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4637 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4638 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4639 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4640 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4641 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4642 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4643 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4644 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4645 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4646 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4647 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4648 auto &DL = I.getDataLayout();
4649 Info.opc = ISD::INTRINSIC_W_CHAIN;
4650 Info.memVT = getValueType(DL, I.getType());
4651 Info.ptrVal = I.getArgOperand(0);
4652 Info.offset = 0;
4653 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4654 Info.align.reset();
4655 return true;
4656 }
4657
4658 case Intrinsic::nvvm_ldu_global_i:
4659 case Intrinsic::nvvm_ldu_global_f:
4660 case Intrinsic::nvvm_ldu_global_p: {
4661 auto &DL = I.getDataLayout();
4662 Info.opc = ISD::INTRINSIC_W_CHAIN;
4663 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4664 Info.memVT = getValueType(DL, I.getType());
4665 else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4666 Info.memVT = getPointerTy(DL);
4667 else
4668 Info.memVT = getValueType(DL, I.getType());
4669 Info.ptrVal = I.getArgOperand(0);
4670 Info.offset = 0;
4671 Info.flags = MachineMemOperand::MOLoad;
4672 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4673
4674 return true;
4675 }
4676 case Intrinsic::nvvm_ldg_global_i:
4677 case Intrinsic::nvvm_ldg_global_f:
4678 case Intrinsic::nvvm_ldg_global_p: {
4679 auto &DL = I.getDataLayout();
4680
4681 Info.opc = ISD::INTRINSIC_W_CHAIN;
4682 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4683 Info.memVT = getValueType(DL, I.getType());
4684 else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4685 Info.memVT = getPointerTy(DL);
4686 else
4687 Info.memVT = getValueType(DL, I.getType());
4688 Info.ptrVal = I.getArgOperand(0);
4689 Info.offset = 0;
4690 Info.flags = MachineMemOperand::MOLoad;
4691 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4692
4693 return true;
4694 }
4695
4696 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4697 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4698 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4699 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4700 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4701 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4702 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4703 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4704 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4705 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4706 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4707 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4708 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4709 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4710 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4711 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4712 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4713 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4714 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4715 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4716 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4717 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4718 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4719 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4720 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4721 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4722 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4723 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4724 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4725 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4726 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4727 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4728 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4729 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4730 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4731 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4732 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4733 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4734 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4735 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4736 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4737 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4738 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4739 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4740 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4741 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4742 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4743 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4744 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4745 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4746 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4747 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4748 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4749 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4750 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4751 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4752 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4753 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4754 Info.opc = getOpcForTextureInstr(Intrinsic);
4755 Info.memVT = MVT::v4f32;
4756 Info.ptrVal = nullptr;
4757 Info.offset = 0;
4758 Info.flags = MachineMemOperand::MOLoad;
4759 Info.align = Align(16);
4760 return true;
4761
4762 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4763 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4764 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4765 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4766 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4767 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4768 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4769 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4770 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4771 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4772 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4773 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4774 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4775 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4776 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4777 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4778 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4779 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4780 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4781 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4782 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4783 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4784 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4785 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4786 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4787 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4788 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4789 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4790 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4791 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4792 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4793 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4794 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4795 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4796 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4797 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4798 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4799 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4800 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4801 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4802 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4803 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4804 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4805 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4806 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4807 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4808 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4809 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4810 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4811 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4812 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4813 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4814 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4815 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4816 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4817 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4818 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4819 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4820 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4821 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4822 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4823 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4824 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4825 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4826 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4827 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4828 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4829 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4830 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4831 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4832 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4833 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4834 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4835 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4836 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4837 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4838 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4839 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4840 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4841 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4842 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4843 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4844 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4845 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4846 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4847 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4848 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4849 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4850 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4851 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4852 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4853 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4854 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4855 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4856 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4857 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4858 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4859 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4860 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4861 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4862 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4863 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4864 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4865 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4866 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4867 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4868 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4869 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4870 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4871 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4872 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4873 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4874 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4875 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4876 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4877 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4878 Info.opc = getOpcForTextureInstr(Intrinsic);
4879 Info.memVT = MVT::v4i32;
4880 Info.ptrVal = nullptr;
4881 Info.offset = 0;
4882 Info.flags = MachineMemOperand::MOLoad;
4883 Info.align = Align(16);
4884 return true;
4885
4886 case Intrinsic::nvvm_suld_1d_i8_clamp:
4887 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4888 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4889 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4890 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4891 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4892 case Intrinsic::nvvm_suld_2d_i8_clamp:
4893 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4894 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4895 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4896 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4897 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4898 case Intrinsic::nvvm_suld_3d_i8_clamp:
4899 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4900 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4901 case Intrinsic::nvvm_suld_1d_i8_trap:
4902 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4903 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4904 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4905 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4906 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4907 case Intrinsic::nvvm_suld_2d_i8_trap:
4908 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4909 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4910 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4911 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4912 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4913 case Intrinsic::nvvm_suld_3d_i8_trap:
4914 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4915 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4916 case Intrinsic::nvvm_suld_1d_i8_zero:
4917 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4918 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4919 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4920 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4921 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4922 case Intrinsic::nvvm_suld_2d_i8_zero:
4923 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4924 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4925 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4926 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4927 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4928 case Intrinsic::nvvm_suld_3d_i8_zero:
4929 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4930 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4931 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4932 Info.memVT = MVT::i8;
4933 Info.ptrVal = nullptr;
4934 Info.offset = 0;
4935 Info.flags = MachineMemOperand::MOLoad;
4936 Info.align = Align(16);
4937 return true;
4938
4939 case Intrinsic::nvvm_suld_1d_i16_clamp:
4940 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4941 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4942 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4943 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4944 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4945 case Intrinsic::nvvm_suld_2d_i16_clamp:
4946 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4947 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4948 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4949 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4950 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4951 case Intrinsic::nvvm_suld_3d_i16_clamp:
4952 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4953 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4954 case Intrinsic::nvvm_suld_1d_i16_trap:
4955 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4956 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4957 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4958 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4959 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4960 case Intrinsic::nvvm_suld_2d_i16_trap:
4961 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4962 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4963 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4964 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4965 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4966 case Intrinsic::nvvm_suld_3d_i16_trap:
4967 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4968 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4969 case Intrinsic::nvvm_suld_1d_i16_zero:
4970 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4971 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4972 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4973 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4974 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4975 case Intrinsic::nvvm_suld_2d_i16_zero:
4976 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4977 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4978 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4979 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4980 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4981 case Intrinsic::nvvm_suld_3d_i16_zero:
4982 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4983 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4984 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4985 Info.memVT = MVT::i16;
4986 Info.ptrVal = nullptr;
4987 Info.offset = 0;
4988 Info.flags = MachineMemOperand::MOLoad;
4989 Info.align = Align(16);
4990 return true;
4991
4992 case Intrinsic::nvvm_suld_1d_i32_clamp:
4993 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4994 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4995 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4996 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4997 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4998 case Intrinsic::nvvm_suld_2d_i32_clamp:
4999 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5000 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5001 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5002 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5003 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5004 case Intrinsic::nvvm_suld_3d_i32_clamp:
5005 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5006 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5007 case Intrinsic::nvvm_suld_1d_i32_trap:
5008 case Intrinsic::nvvm_suld_1d_v2i32_trap:
5009 case Intrinsic::nvvm_suld_1d_v4i32_trap:
5010 case Intrinsic::nvvm_suld_1d_array_i32_trap:
5011 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5012 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5013 case Intrinsic::nvvm_suld_2d_i32_trap:
5014 case Intrinsic::nvvm_suld_2d_v2i32_trap:
5015 case Intrinsic::nvvm_suld_2d_v4i32_trap:
5016 case Intrinsic::nvvm_suld_2d_array_i32_trap:
5017 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5018 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5019 case Intrinsic::nvvm_suld_3d_i32_trap:
5020 case Intrinsic::nvvm_suld_3d_v2i32_trap:
5021 case Intrinsic::nvvm_suld_3d_v4i32_trap:
5022 case Intrinsic::nvvm_suld_1d_i32_zero:
5023 case Intrinsic::nvvm_suld_1d_v2i32_zero:
5024 case Intrinsic::nvvm_suld_1d_v4i32_zero:
5025 case Intrinsic::nvvm_suld_1d_array_i32_zero:
5026 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5027 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5028 case Intrinsic::nvvm_suld_2d_i32_zero:
5029 case Intrinsic::nvvm_suld_2d_v2i32_zero:
5030 case Intrinsic::nvvm_suld_2d_v4i32_zero:
5031 case Intrinsic::nvvm_suld_2d_array_i32_zero:
5032 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5033 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5034 case Intrinsic::nvvm_suld_3d_i32_zero:
5035 case Intrinsic::nvvm_suld_3d_v2i32_zero:
5036 case Intrinsic::nvvm_suld_3d_v4i32_zero:
5037 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5038 Info.memVT = MVT::i32;
5039 Info.ptrVal = nullptr;
5040 Info.offset = 0;
5041 Info.flags = MachineMemOperand::MOLoad;
5042 Info.align = Align(16);
5043 return true;
5044
5045 case Intrinsic::nvvm_suld_1d_i64_clamp:
5046 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5047 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5048 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5049 case Intrinsic::nvvm_suld_2d_i64_clamp:
5050 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5051 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5052 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5053 case Intrinsic::nvvm_suld_3d_i64_clamp:
5054 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5055 case Intrinsic::nvvm_suld_1d_i64_trap:
5056 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5057 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5058 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5059 case Intrinsic::nvvm_suld_2d_i64_trap:
5060 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5061 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5062 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5063 case Intrinsic::nvvm_suld_3d_i64_trap:
5064 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5065 case Intrinsic::nvvm_suld_1d_i64_zero:
5066 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5067 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5068 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5069 case Intrinsic::nvvm_suld_2d_i64_zero:
5070 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5071 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5072 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5073 case Intrinsic::nvvm_suld_3d_i64_zero:
5074 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5075 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5076 Info.memVT = MVT::i64;
5077 Info.ptrVal = nullptr;
5078 Info.offset = 0;
5079 Info.flags = MachineMemOperand::MOLoad;
5080 Info.align = Align(16);
5081 return true;
5082 }
5083 return false;
5084}
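// For example (illustrative IR; intrinsic name suffixes abbreviated):
//   %v = call i32 @llvm.nvvm.ldg.global.i(ptr addrspace(1) %p, i32 4)
// is reported above with Info.opc = ISD::INTRINSIC_W_CHAIN, Info.memVT = i32,
// Info.ptrVal = %p, Info.flags = MOLoad, and Info.align = 4 (taken from the
// constant alignment operand), which lets the SelectionDAG builder attach a
// correct MachineMemOperand to the resulting node.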
5085
5086/// getFunctionParamOptimizedAlign - since function arguments are passed via
5087/// .param space, we may want to increase their alignment in a way that
5088/// ensures that we can effectively vectorize their loads & stores. We can
5089/// increase alignment only if the function has internal or private
5090/// linkage, as for other linkage types callers may already rely on the
5091/// default alignment. To allow using 128-bit vectorized loads/stores, this
5092/// function ensures that alignment is 16 or greater.
5093Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5094    const Function *F, Type *ArgTy, const DataLayout &DL) const {
5095 // Capping the alignment to 128 bytes as that is the maximum alignment
5096 // supported by PTX.
5097 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5098
5099 // If a function has linkage different from internal or private, we
5100 // must use default ABI alignment as external users rely on it. Same
5101 // for a function that may be called from a function pointer.
5102 if (!F || !F->hasLocalLinkage() ||
5103 F->hasAddressTaken(/*Users=*/nullptr,
5104 /*IgnoreCallbackUses=*/false,
5105 /*IgnoreAssumeLikeCalls=*/true,
5106 /*IgnoreLLVMUsed=*/true))
5107 return ABITypeAlign;
5108
5109 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5110 return std::max(Align(16), ABITypeAlign);
5111}
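// Sketch of the effect for a hypothetical device function with internal
// linkage taking an aggregate by value:
//   define internal void @helper([8 x float] %a)
// The ABI alignment of the array element type is only 4, but raising the
// .param alignment to 16 allows its loads and stores to be vectorized into
// 128-bit (e.g. v4.f32) accesses.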
5112
5113/// Helper for computing the alignment of a device function's byval parameter.
5114Align NVPTXTargetLowering::getFunctionByValParamAlign(
5115    const Function *F, Type *ArgTy, Align InitialAlign,
5116 const DataLayout &DL) const {
5117 Align ArgAlign = InitialAlign;
5118 // Try to increase alignment to enhance vectorization options.
5119 if (F)
5120 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5121
5122 // Old ptx versions have a bug. When PTX code takes the address of a
5123 // byval parameter with alignment < 4, ptxas generates code to
5124 // spill the argument into memory. Alas, on sm_50+ ptxas generates
5125 // SASS code that fails with a misaligned access. To work around
5126 // the problem, make sure that we align byval parameters to at
5127 // least 4 bytes. This bug seems to be fixed starting at least with
5128 // ptxas > 9.0.
5129 // TODO: remove this after verifying the bug is not reproduced
5130 // on non-deprecated ptxas versions.
5131 if (ForceMinByValParamAlign)
5132 ArgAlign = std::max(ArgAlign, Align(4));
5133
5134 return ArgAlign;
5135}
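// Resulting precedence for a byval parameter's alignment: start from the
// alignment specified in the IR, possibly raise it to 16 via
// getFunctionParamOptimizedAlign for locally-visible functions, and, when
// -nvptx-force-min-byval-param-align is set, enforce the 4-byte floor that
// works around the ptxas issue described above.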
5136
5137// Helper for getting a function parameter's name. The name is composed from
5138// the parameter's index and the function name. A negative index corresponds
5139// to the special parameter (unsized array) used for passing variable arguments.
5140std::string NVPTXTargetLowering::getParamName(const Function *F,
5141                                              int Idx) const {
5142 std::string ParamName;
5143 raw_string_ostream ParamStr(ParamName);
5144
5145 ParamStr << getTargetMachine().getSymbol(F)->getName();
5146 if (Idx < 0)
5147 ParamStr << "_vararg";
5148 else
5149 ParamStr << "_param_" << Idx;
5150
5151 return ParamName;
5152}
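// For example, for a function whose emitted symbol is "foo" (name
// illustrative): getParamName(F, 2) produces "foo_param_2", while
// getParamName(F, -1) produces "foo_vararg" for the variadic-argument
// array parameter.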
5153
5154/// isLegalAddressingMode - Return true if the addressing mode represented
5155/// by AM is legal for this target, for a load/store of the specified type.
5156/// Used to guide target specific optimizations, like loop strength reduction
5157/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5158/// (CodeGenPrepare.cpp)
5159bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5160                                                const AddrMode &AM, Type *Ty,
5161 unsigned AS, Instruction *I) const {
5162 // AddrMode - This represents an addressing mode of:
5163 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5164 //
5165 // The legal address modes are
5166 // - [avar]
5167 // - [areg]
5168 // - [areg+immoff]
5169 // - [immAddr]
5170
5171 // immoff must fit in a signed 32-bit int
5172 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5173 return false;
5174
5175 if (AM.BaseGV)
5176 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5177
5178 switch (AM.Scale) {
5179 case 0: // "r", "r+i" or "i" is allowed
5180 break;
5181 case 1:
5182 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5183 return false;
5184 // Otherwise we have r+i.
5185 break;
5186 default:
5187 // No scale > 1 is allowed
5188 return false;
5189 }
5190 return true;
5191}
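// A few illustrative queries against the rules above:
//   {BaseGV=@g, BaseOffs=0,  HasBaseReg=0, Scale=0} -> legal   ([avar])
//   {BaseGV=0,  BaseOffs=16, HasBaseReg=1, Scale=0} -> legal   ([areg+16])
//   {BaseGV=0,  BaseOffs=0,  HasBaseReg=1, Scale=1} -> illegal (r+r)
//   any Scale > 1                                   -> illegal (PTX has no
//                                                    scaled addressing)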
5192
5193//===----------------------------------------------------------------------===//
5194// NVPTX Inline Assembly Support
5195//===----------------------------------------------------------------------===//
5196
5197/// getConstraintType - Given a constraint letter, return the type of
5198/// constraint it is for this target.
5199NVPTXTargetLowering::ConstraintType
5200NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5201 if (Constraint.size() == 1) {
5202 switch (Constraint[0]) {
5203 default:
5204 break;
5205 case 'b':
5206 case 'r':
5207 case 'h':
5208 case 'c':
5209 case 'l':
5210 case 'f':
5211 case 'd':
5212 case 'q':
5213 case '0':
5214 case 'N':
5215 return C_RegisterClass;
5216 }
5217 }
5218 return TargetLowering::getConstraintType(Constraint);
5219}
5220
5221std::pair<unsigned, const TargetRegisterClass *>
5222NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5223                                                  StringRef Constraint,
5224 MVT VT) const {
5225 if (Constraint.size() == 1) {
5226 switch (Constraint[0]) {
5227 case 'b':
5228 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5229 case 'c':
5230 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5231 case 'h':
5232 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5233 case 'r':
5234 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5235 case 'l':
5236 case 'N':
5237 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5238 case 'q': {
5239 if (STI.getSmVersion() < 70)
5240 report_fatal_error("Inline asm with 128 bit operands is only "
5241 "supported for sm_70 and higher!");
5242 return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
5243 }
5244 case 'f':
5245 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5246 case 'd':
5247 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5248 }
5249 }
5250 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5251}
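// Usage sketch for the constraint letters handled above: "b" selects a
// 1-bit predicate register, "c"/"h" a 16-bit, "r" a 32-bit, "l"/"N" a
// 64-bit, and "q" a 128-bit integer register (sm_70+), while "f" and "d"
// select f32 and f64 registers. For example:
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
// maps all three operands to Int32RegsRegClass.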
5252
5253//===----------------------------------------------------------------------===//
5254// NVPTX DAG Combining
5255//===----------------------------------------------------------------------===//
5256
5257bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5258                                   CodeGenOptLevel OptLevel) const {
5259 // Always honor command-line argument
5260 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5261 return FMAContractLevelOpt > 0;
5262
5263 // Do not contract if we're not optimizing the code.
5264 if (OptLevel == CodeGenOptLevel::None)
5265 return false;
5266
5267 // Honor TargetOptions flags that explicitly say fusion is okay.
5268 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5269 return true;
5270
5271 return allowUnsafeFPMath(MF);
5272}
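// Decision order, as implemented above: an explicit -nvptx-fma-level on the
// command line always wins; otherwise contraction is disabled at -O0,
// enabled when FP operation fusion is explicitly allowed (FPOpFusion::Fast,
// e.g. via -ffp-contract=fast), and finally falls back to the
// unsafe-FP-math query below.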
5273
5274bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5275 // Honor TargetOptions flags that explicitly say unsafe math is okay.
5276 if (MF.getTarget().Options.UnsafeFPMath)
5277 return true;
5278
5279 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5280 const Function &F = MF.getFunction();
5281 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5282}
5283
5284static bool isConstZero(const SDValue &Operand) {
5285 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5286 return Const && Const->getZExtValue() == 0;
5287}
5288
5289/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5290/// operands N0 and N1. This is a helper for PerformADDCombine that is
5291/// called with the default operands, and if that fails, with commuted
5292/// operands.
5293static SDValue
5294PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5295                              TargetLowering::DAGCombinerInfo &DCI) {
5296 EVT VT = N0.getValueType();
5297
5298 // Since integer multiply-add costs the same as integer multiply
5299 // but is more costly than integer add, do the fusion only when
5300 // the mul is only used in the add.
5301 // TODO: this may not be true for later architectures, consider relaxing this
5302 if (!N0.getNode()->hasOneUse())
5303 return SDValue();
5304
5305 // fold (add (mul a, b), c) -> (mad a, b, c)
5306 //
5307 if (N0.getOpcode() == ISD::MUL)
5308 return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
5309 N0.getOperand(1), N1);
5310
5311 // fold (add (select cond, 0, (mul a, b)), c)
5312 // -> (select cond, c, (mad a, b, c))
5313 //
5314 if (N0.getOpcode() == ISD::SELECT) {
5315 unsigned ZeroOpNum;
5316 if (isConstZero(N0->getOperand(1)))
5317 ZeroOpNum = 1;
5318 else if (isConstZero(N0->getOperand(2)))
5319 ZeroOpNum = 2;
5320 else
5321 return SDValue();
5322
5323 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5324 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5325 return SDValue();
5326
5327 SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5328 M->getOperand(0), M->getOperand(1), N1);
5329 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5330 ((ZeroOpNum == 1) ? N1 : MAD),
5331 ((ZeroOpNum == 1) ? MAD : N1));
5332 }
5333
5334 return SDValue();
5335}
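// Worked example of the first fold above (illustrative i32 values):
//   t1 = mul i32 %a, %b     ; t1 has a single use
//   t2 = add i32 t1, %c
// becomes one NVPTXISD::IMAD node, i.e. a single mad.lo.s32 instruction.
// The select form folds analogously, so the mad ends up only on the
// non-zero arm of the select.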
5336
5337static SDValue
5338PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5339                               TargetLowering::DAGCombinerInfo &DCI,
5340                               CodeGenOptLevel OptLevel) {
5341 EVT VT = N0.getValueType();
5342 if (N0.getOpcode() == ISD::FMUL) {
5343 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5344 &DCI.DAG.getTargetLoweringInfo());
5345 if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
5346 return SDValue();
5347
5348 // For floating point:
5349 // Do the fusion only when the mul has fewer than 5 uses and all
5350 // of them are adds.
5351 // The heuristic is that if a use is not an add, then that use
5352 // cannot be fused into an fma, so the mul is still needed anyway.
5353 // If there are more than 4 uses, even if they are all adds, fusing
5354 // them will increase register pressure.
5355 //
5356 int numUses = 0;
5357 int nonAddCount = 0;
5358 for (const SDNode *User : N0.getNode()->uses()) {
5359 numUses++;
5360 if (User->getOpcode() != ISD::FADD)
5361 ++nonAddCount;
5362 if (numUses >= 5)
5363 return SDValue();
5364 }
5365 if (nonAddCount) {
5366 int orderNo = N->getIROrder();
5367 int orderNo2 = N0.getNode()->getIROrder();
5368 // Simple heuristic for estimating potential register pressure:
5369 // the difference in IR order is used to measure the distance
5370 // between def and use; the longer the distance, the more likely
5371 // it is to cause register pressure.
5372 if (orderNo - orderNo2 < 500)
5373 return SDValue();
5374
5375 // Now, check if at least one of the FMUL's operands is live beyond the
5376 // node N, which guarantees that the FMA will not increase register
5377 // pressure at node N.
5378 bool opIsLive = false;
5379 const SDNode *left = N0.getOperand(0).getNode();
5380 const SDNode *right = N0.getOperand(1).getNode();
5381
5382 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5383 opIsLive = true;
5384
5385 if (!opIsLive)
5386 for (const SDNode *User : left->uses()) {
5387 int orderNo3 = User->getIROrder();
5388 if (orderNo3 > orderNo) {
5389 opIsLive = true;
5390 break;
5391 }
5392 }
5393
5394 if (!opIsLive)
5395 for (const SDNode *User : right->uses()) {
5396 int orderNo3 = User->getIROrder();
5397 if (orderNo3 > orderNo) {
5398 opIsLive = true;
5399 break;
5400 }
5401 }
5402
5403 if (!opIsLive)
5404 return SDValue();
5405 }
5406
5407 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5408 N0.getOperand(1), N1);
5409 }
5410
5411 return SDValue();
5412}
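// Sketch of the distance heuristic above with illustrative IR orders: an
// fmul at order 100 that has a non-fadd use and whose fadd sits at order
// 700 passes the (orderNo - orderNo2 >= 500) test; if one fmul operand is
// also live past the fadd (or is a constant), the fmul is contracted into
// an ISD::FMA node, otherwise the combine backs off to avoid growing
// register pressure.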
5413
5414static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
5415 std::size_t Back) {
5416 if (all_of(N->ops().drop_front(Front).drop_back(Back),
5417 [](const SDUse &U) { return U.get()->isUndef(); }))
5418 // Operand 0 is the previous value in the chain. Cannot return EntryToken
5419 // as the previous value will become unused and eliminated later.
5420 return N->getOperand(0);
5421
5422 return SDValue();
5423}
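// For instance, a store node whose value operands are all undef performs no
// observable store, so the helper folds the node away by returning its
// incoming chain (operand 0) instead of the EntryToken.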
5424
5425static SDValue PerformStoreParamCombine(SDNode *N) {
5426 // Operands from the 3rd to the second-to-last one are the values to be
5427 // stored: {Chain, ArgID, Offset, Val, Glue}
5428 return PerformStoreCombineHelper(N, 3, 1);
5429}
5430
5431static SDValue PerformStoreRetvalCombine(SDNode *N) {
5432 // Operands from the 2nd to the last one are the values to be stored.
5433 return PerformStoreCombineHelper(N, 2, 0);
5434}
5435
5436/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5437///
5438static SDValue PerformADDCombine(SDNode *N,
5439                                 TargetLowering::DAGCombinerInfo &DCI,
5440                                 CodeGenOptLevel OptLevel) {
5441 if (OptLevel == CodeGenOptLevel::None)
5442 return SDValue();
5443
5444 SDValue N0 = N->getOperand(0);
5445 SDValue N1 = N->getOperand(1);
5446
5447 // Skip vector types and any scalar type other than i32.
5448 EVT VT = N0.getValueType();
5449 if (VT.isVector() || VT != MVT::i32)
5450 return SDValue();
5451
5452 // First try with the default operand order.
5453 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5454 return Result;
5455
5456 // If that didn't work, try again with the operands commuted.
5457 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5458}
5459
5460/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5461///
5462 static SDValue PerformFADDCombine(SDNode *N,
5463 TargetLowering::DAGCombinerInfo &DCI,
5464 CodeGenOptLevel OptLevel) {
5465 SDValue N0 = N->getOperand(0);
5466 SDValue N1 = N->getOperand(1);
5467
5468 EVT VT = N0.getValueType();
5469 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5470 return SDValue();
5471
5472 // First try with the default operand order.
5473 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5474 return Result;
5475
5476 // If that didn't work, try again with the operands commuted.
5477 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5478}
5479
5480 static SDValue PerformANDCombine(SDNode *N,
5481 TargetLowering::DAGCombinerInfo &DCI) {
5482 // The type legalizer turns a vector load of i8 values into a zextload to i16
5483 // registers, optionally ANY_EXTENDs it (if the target type is integer),
5484 // and ANDs off the high 8 bits. Since we turn this load into a
5485 // target-specific DAG node, the DAG combiner fails to eliminate these AND
5486 // nodes. Do that here.
5487 SDValue Val = N->getOperand(0);
5488 SDValue Mask = N->getOperand(1);
5489
5490 if (isa<ConstantSDNode>(Val)) {
5491 std::swap(Val, Mask);
5492 }
5493
5494 SDValue AExt;
5495
5496 // Convert BFE -> truncate i16 -> and 255
5497 // to just BFE -> truncate i16, as the value already has all the bits in
5498 // the right places.
5499 if (Val.getOpcode() == ISD::TRUNCATE) {
5500 SDValue BFE = Val.getOperand(0);
5501 if (BFE.getOpcode() != NVPTXISD::BFE)
5502 return SDValue();
5503
5504 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5505 if (!BFEBits)
5506 return SDValue();
5507 uint64_t BFEBitsVal = BFEBits->getZExtValue();
5508
5509 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5510 if (!MaskCnst) {
5511 // Not an AND with a constant
5512 return SDValue();
5513 }
5514 uint64_t MaskVal = MaskCnst->getZExtValue();
5515
5516 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5517 return SDValue();
5518 // If we get here, the AND is unnecessary. Just replace it with the trunc
5519 DCI.CombineTo(N, Val, false);
5520 }
5521 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5522 if (Val.getOpcode() == ISD::ANY_EXTEND) {
5523 AExt = Val;
5524 Val = Val->getOperand(0);
5525 }
5526
5527 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5528 Val = Val->getOperand(0);
5529 }
5530
5531 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5532 Val->getOpcode() == NVPTXISD::LoadV4) {
5533 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5534 if (!MaskCnst) {
5535 // Not an AND with a constant
5536 return SDValue();
5537 }
5538
5539 uint64_t MaskVal = MaskCnst->getZExtValue();
5540 if (MaskVal != 0xff) {
5541 // Not an AND that chops off top 8 bits
5542 return SDValue();
5543 }
5544
5545 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5546 if (!Mem) {
5547 // Not a MemSDNode?!?
5548 return SDValue();
5549 }
5550
5551 EVT MemVT = Mem->getMemoryVT();
5552 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5553 // We only handle the i8 case
5554 return SDValue();
5555 }
5556
5557 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5558 if (ExtType == ISD::SEXTLOAD) {
5559 // If for some reason the load is a sextload, the and is needed to zero
5560 // out the high 8 bits
5561 return SDValue();
5562 }
5563
5564 bool AddTo = false;
5565 if (AExt.getNode() != nullptr) {
5566 // Re-insert the ext as a zext.
5567 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5568 AExt.getValueType(), Val);
5569 AddTo = true;
5570 }
5571
5572 // If we get here, the AND is unnecessary. Just replace it with the load
5573 DCI.CombineTo(N, Val, AddTo);
5574 }
5575
5576 return SDValue();
5577}
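// Editorial sketch (not part of the upstream source). Two patterns handled
// above, where the AND mask is provably redundant:
//   (and (trunc i16 (BFE ...)), mask), with mask == (1 << BFE-width) - 1
//       --> (trunc i16 (BFE ...))
//   (and (any_extend (LoadV2/LoadV4 of v2i8/v4i8)), 255)
//       --> the (zero-extended) load result itself
// In both cases the high bits are already zero, so the mask is a no-op.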
5578
5579 static SDValue PerformREMCombine(SDNode *N,
5580 TargetLowering::DAGCombinerInfo &DCI,
5581 CodeGenOptLevel OptLevel) {
5582 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5583
5584 // Don't do anything at less than -O2.
5585 if (OptLevel < CodeGenOptLevel::Default)
5586 return SDValue();
5587
5588 SelectionDAG &DAG = DCI.DAG;
5589 SDLoc DL(N);
5590 EVT VT = N->getValueType(0);
5591 bool IsSigned = N->getOpcode() == ISD::SREM;
5592 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5593
5594 const SDValue &Num = N->getOperand(0);
5595 const SDValue &Den = N->getOperand(1);
5596
5597 for (const SDNode *U : Num->uses()) {
5598 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5599 U->getOperand(1) == Den) {
5600 // Num % Den -> Num - (Num / Den) * Den
5601 return DAG.getNode(ISD::SUB, DL, VT, Num,
5602 DAG.getNode(ISD::MUL, DL, VT,
5603 DAG.getNode(DivOpc, DL, VT, Num, Den),
5604 Den));
5605 }
5606 }
5607 return SDValue();
5608}
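// Editorial note (not part of the upstream source): the rewrite above fires
// only when a matching division of the same operands already exists, so
//   q = udiv i32 num, den
//   r = urem i32 num, den
// becomes
//   r = sub i32 num, (mul i32 q, den)
// reusing q instead of emitting a second expensive divide/remainder.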
5609
5610 enum OperandSignedness {
5611 Signed = 0,
5612 Unsigned,
5613 Unknown
5614 };
5615
5616/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5617/// that can be demoted to \p OptSize bits without loss of information. The
5618/// signedness of the operand, if determinable, is placed in \p S.
5619 static bool IsMulWideOperandDemotable(SDValue Op,
5620 unsigned OptSize,
5621 OperandSignedness &S) {
5622 S = Unknown;
5623
5624 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5625 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5626 EVT OrigVT = Op.getOperand(0).getValueType();
5627 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5628 S = Signed;
5629 return true;
5630 }
5631 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5632 EVT OrigVT = Op.getOperand(0).getValueType();
5633 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5634 S = Unsigned;
5635 return true;
5636 }
5637 }
5638
5639 return false;
5640}
5641
5642/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5643/// be demoted to \p OptSize bits without loss of information. If the operands
5644/// contain a constant, it should appear as the RHS operand. The signedness of
5645/// the operands is placed in \p IsSigned.
5646 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5647 unsigned OptSize,
5648 bool &IsSigned) {
5649 OperandSignedness LHSSign;
5650
5651 // The LHS operand must be a demotable op
5652 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5653 return false;
5654
5655 // We should have been able to determine the signedness from the LHS
5656 if (LHSSign == Unknown)
5657 return false;
5658
5659 IsSigned = (LHSSign == Signed);
5660
5661 // The RHS can be a demotable op or a constant
5662 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5663 const APInt &Val = CI->getAPIntValue();
5664 if (LHSSign == Unsigned) {
5665 return Val.isIntN(OptSize);
5666 } else {
5667 return Val.isSignedIntN(OptSize);
5668 }
5669 } else {
5670 OperandSignedness RHSSign;
5671 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5672 return false;
5673
5674 return LHSSign == RHSSign;
5675 }
5676}
5677
5678/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5679/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5680/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5681/// amount.
5682 static SDValue TryMULWIDECombine(SDNode *N,
5683 TargetLowering::DAGCombinerInfo &DCI) {
5684 EVT MulType = N->getValueType(0);
5685 if (MulType != MVT::i32 && MulType != MVT::i64) {
5686 return SDValue();
5687 }
5688
5689 SDLoc DL(N);
5690 unsigned OptSize = MulType.getSizeInBits() >> 1;
5691 SDValue LHS = N->getOperand(0);
5692 SDValue RHS = N->getOperand(1);
5693
5694 // Canonicalize the multiply so the constant (if any) is on the right
5695 if (N->getOpcode() == ISD::MUL) {
5696 if (isa<ConstantSDNode>(LHS)) {
5697 std::swap(LHS, RHS);
5698 }
5699 }
5700
5701 // If we have a SHL, determine the actual multiply amount
5702 if (N->getOpcode() == ISD::SHL) {
5703 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5704 if (!ShlRHS) {
5705 return SDValue();
5706 }
5707
5708 APInt ShiftAmt = ShlRHS->getAPIntValue();
5709 unsigned BitWidth = MulType.getSizeInBits();
5710 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5711 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5712 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5713 } else {
5714 return SDValue();
5715 }
5716 }
5717
5718 bool Signed;
5719 // Verify that our operands are demotable
5720 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5721 return SDValue();
5722 }
5723
5724 EVT DemotedVT;
5725 if (MulType == MVT::i32) {
5726 DemotedVT = MVT::i16;
5727 } else {
5728 DemotedVT = MVT::i32;
5729 }
5730
5731 // Truncate the operands to the correct size. Note that these are just for
5732 // type consistency and will (likely) be eliminated in later phases.
5733 SDValue TruncLHS =
5734 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5735 SDValue TruncRHS =
5736 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5737
5738 unsigned Opc;
5739 if (Signed) {
5740 Opc = NVPTXISD::MUL_WIDE_SIGNED;
5741 } else {
5742 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5743 }
5744
5745 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5746}
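// Editorial sketch (not part of the upstream source): for a 32-bit multiply
// (OptSize == 16), the demotion above yields, e.g.
//   (mul i32 (sext i16 a), (sext i16 b)) --> (MUL_WIDE_SIGNED a, b)
//   (shl i32 (zext i16 a), 4)            --> (MUL_WIDE_UNSIGNED a, 16)
// assuming any constant operand also fits in the demoted 16-bit type.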
5747
5748static bool isConstOne(const SDValue &Operand) {
5749 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5750 return Const && Const->getZExtValue() == 1;
5751}
5752
5753 static SDValue matchMADConstOnePattern(SDValue Add) {
5754 if (Add->getOpcode() != ISD::ADD)
5755 return SDValue();
5756
5757 if (isConstOne(Add->getOperand(0)))
5758 return Add->getOperand(1);
5759
5760 if (isConstOne(Add->getOperand(1)))
5761 return Add->getOperand(0);
5762
5763 return SDValue();
5764}
5765
5766 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5767 TargetLowering::DAGCombinerInfo &DCI) {
5768
5769 if (SDValue Y = matchMADConstOnePattern(Add))
5770 return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
5771
5772 return SDValue();
5773}
5774
5775 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5776 SDLoc DL,
5777 TargetLowering::DAGCombinerInfo &DCI) {
5778 if (Select->getOpcode() != ISD::SELECT)
5779 return SDValue();
5780
5781 SDValue Cond = Select->getOperand(0);
5782
5783 unsigned ConstOpNo;
5784 if (isConstOne(Select->getOperand(1)))
5785 ConstOpNo = 1;
5786 else if (isConstOne(Select->getOperand(2)))
5787 ConstOpNo = 2;
5788 else
5789 return SDValue();
5790
5791 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5792
5793 // Do not combine if the resulting sequence is not obviously profitable.
5794 if (!matchMADConstOnePattern(Y))
5795 return SDValue();
5796
5797 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5798
5799 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5800 (ConstOpNo == 1) ? X : NewMul,
5801 (ConstOpNo == 1) ? NewMul : X);
5802}
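// Editorial sketch (not part of the upstream source): with ConstOpNo == 1,
//   (mul x, (select c, 1, y)) --> (select c, x, (mul x, y))
// and the profitability guard requires y to match the mad-const-one pattern,
// so the new inner multiply can itself fold to (mad x, z, x) for y == z + 1.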
5803
5804 static SDValue
5805 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5806 TargetLowering::DAGCombinerInfo &DCI) {
5807
5808 EVT VT = N0.getValueType();
5809 if (VT.isVector())
5810 return SDValue();
5811
5812 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5813 return SDValue();
5814
5815 SDLoc DL(N);
5816
5817 // (mul x, (add y, 1)) -> (mad x, y, x)
5818 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5819 return Res;
5820 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5821 return Res;
5822
5823 // (mul x, (select c, 1, y)) -> (select c, x, (mul x, y))
5824 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5825 return Res;
5826 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5827 return Res;
5828
5829 return SDValue();
5830}
5831
5832/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5833 static SDValue PerformMULCombine(SDNode *N,
5834 TargetLowering::DAGCombinerInfo &DCI,
5835 CodeGenOptLevel OptLevel) {
5836 if (OptLevel == CodeGenOptLevel::None)
5837 return SDValue();
5838
5839 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5840 return Ret;
5841
5842 SDValue N0 = N->getOperand(0);
5843 SDValue N1 = N->getOperand(1);
5844 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5845}
5846
5847/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5848 static SDValue PerformSHLCombine(SDNode *N,
5849 TargetLowering::DAGCombinerInfo &DCI,
5850 CodeGenOptLevel OptLevel) {
5851 if (OptLevel > CodeGenOptLevel::None) {
5852 // Try mul.wide combining at OptLevel > 0
5853 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5854 return Ret;
5855 }
5856
5857 return SDValue();
5858}
5859
5860 static SDValue PerformSETCCCombine(SDNode *N,
5861 TargetLowering::DAGCombinerInfo &DCI,
5862 unsigned int SmVersion) {
5863 EVT CCType = N->getValueType(0);
5864 SDValue A = N->getOperand(0);
5865 SDValue B = N->getOperand(1);
5866
5867 EVT AType = A.getValueType();
5868 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5869 return SDValue();
5870
5871 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5872 return SDValue();
5873
5874 SDLoc DL(N);
5875 // setp.f16x2 returns two scalar predicates, which we need to
5876 // convert back to v2i1. The returned result will be scalarized by
5877 // the legalizer, but the comparison will remain a single vector
5878 // instruction.
5879 SDValue CCNode = DCI.DAG.getNode(
5880 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5881 : NVPTXISD::SETP_BF16X2,
5882 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5883 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5884 CCNode.getValue(1));
5885}
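// Editorial sketch (not part of the upstream source): for v2f16 operands,
//   (setcc v2i1 A, B, cc) --> (build_vector (SETP_F16X2 A, B, cc):0,
//                                           (SETP_F16X2 A, B, cc):1)
// i.e. a single packed setp whose two i1 results are re-wrapped; the
// legalizer later scalarizes the v2i1 but the comparison stays vectorized.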
5886
5887 static SDValue PerformEXTRACTCombine(SDNode *N,
5888 TargetLowering::DAGCombinerInfo &DCI) {
5889 SDValue Vector = N->getOperand(0);
5890 SDLoc DL(N);
5891 EVT VectorVT = Vector.getValueType();
5892 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5893 IsPTXVectorType(VectorVT.getSimpleVT()))
5894 return SDValue(); // Native vector loads already combine nicely w/
5895 // extract_vector_elt.
5896 // Don't mess with singletons or with v2*16, v4i8, and v8i8 types; we
5897 // already handle them OK.
5898 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5899 VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5900 return SDValue();
5901
5902 // Don't mess with undef values as sra may be simplified to 0, not undef.
5903 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5904 return SDValue();
5905
5906 uint64_t VectorBits = VectorVT.getSizeInBits();
5907 // We only handle the types we can extract in-register.
5908 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5909 return SDValue();
5910
5911 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5912 // Index == 0 is handled by generic DAG combiner.
5913 if (!Index || Index->getZExtValue() == 0)
5914 return SDValue();
5915
5916 MVT IVT = MVT::getIntegerVT(VectorBits);
5917 EVT EltVT = VectorVT.getVectorElementType();
5918 EVT EltIVT = EltVT.changeTypeToInteger();
5919 uint64_t EltBits = EltVT.getScalarSizeInBits();
5920
5921 SDValue Result = DCI.DAG.getNode(
5922 ISD::TRUNCATE, DL, EltIVT,
5923 DCI.DAG.getNode(
5924 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5925 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5926
5927 // If element has non-integer type, bitcast it back to the expected type.
5928 if (EltVT != EltIVT)
5929 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5930 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5931 if (EltVT != N->getValueType(0))
5932 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5933
5934 return Result;
5935}
5936
5937 static SDValue PerformVSELECTCombine(SDNode *N,
5938 TargetLowering::DAGCombinerInfo &DCI) {
5939 SDValue VA = N->getOperand(1);
5940 EVT VectorVT = VA.getValueType();
5941 if (VectorVT != MVT::v4i8)
5942 return SDValue();
5943
5944 // We need to split vselect into individual per-element operations. Because we
5945 // use BFE/BFI instructions for byte extraction/insertion, we end up with
5946 // 32-bit values, so we may as well do the comparison as i32 to avoid the
5947 // conversions to/from i16 normally used for i8 values.
5948 SmallVector<SDValue, 4> E;
5949 SDLoc DL(N);
5950 SDValue VCond = N->getOperand(0);
5951 SDValue VB = N->getOperand(2);
5952 for (int I = 0; I < 4; ++I) {
5953 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5954 DCI.DAG.getConstant(I, DL, MVT::i32));
5955 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5956 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5957 DCI.DAG.getConstant(I, DL, MVT::i32)),
5958 DL, MVT::i32);
5959 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5960 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5961 DCI.DAG.getConstant(I, DL, MVT::i32)),
5962 DL, MVT::i32);
5963 E.push_back(DCI.DAG.getAnyExtOrTrunc(
5964 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5965 }
5966 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5967}
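// Editorial sketch (not part of the upstream source): a v4i8 vselect becomes
// four scalar selects done in i32, roughly, per lane I:
//   e[I] = trunc i8 (select c[I], (anyext i32 a[I]), (anyext i32 b[I]))
// followed by a build_vector of the four lanes, avoiding the usual i16
// round-trips for i8 values.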
5968
5969 static SDValue PerformLOADCombine(SDNode *N,
5970 TargetLowering::DAGCombinerInfo &DCI) {
5971 SelectionDAG &DAG = DCI.DAG;
5972 LoadSDNode *LD = cast<LoadSDNode>(N);
5973
5974 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5975 // letting ReplaceLoadVector split it into smaller loads during legalization.
5976 // This is done at dag-combine1 time, so that vector operations with i8
5977 // elements can be optimised away instead of being needlessly split during
5978 // legalization, which involves storing to the stack and loading it back.
5979 EVT VT = N->getValueType(0);
5980 if (VT != MVT::v16i8)
5981 return SDValue();
5982
5983 SDLoc DL(N);
5984
5985 // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5986 unsigned Opc = NVPTXISD::LoadV4;
5987 EVT NewVT = MVT::v4i32;
5988 EVT EltVT = NewVT.getVectorElementType();
5989 unsigned NumElts = NewVT.getVectorNumElements();
5990 EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5991 SDVTList RetVTList = DAG.getVTList(RetVTs);
5992 SmallVector<SDValue, 8> Ops(N->ops());
5993 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5994 SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5995 LD->getMemOperand());
5996 SDValue NewChain = NewLoad.getValue(NumElts);
5997
5998 // Create a vector of the same type returned by the original load.
5999 SmallVector<SDValue, 4> Elts;
6000 for (unsigned i = 0; i < NumElts; i++)
6001 Elts.push_back(NewLoad.getValue(i));
6002 return DCI.DAG.getMergeValues(
6003 {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
6004 NewChain},
6005 DL);
6006}
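// Editorial sketch (not part of the upstream source): the net effect above is
//   (v16i8 (load ptr)) --> (bitcast v16i8 (build_vector v4i32 (LoadV4 ptr)))
// i.e. one 128-bit LoadV4 with four i32 results that are reassembled and
// bitcast back to v16i8, with the new chain merged in.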
6007
6008SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6009 DAGCombinerInfo &DCI) const {
6010 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6011 switch (N->getOpcode()) {
6012 default: break;
6013 case ISD::ADD:
6014 return PerformADDCombine(N, DCI, OptLevel);
6015 case ISD::FADD:
6016 return PerformFADDCombine(N, DCI, OptLevel);
6017 case ISD::MUL:
6018 return PerformMULCombine(N, DCI, OptLevel);
6019 case ISD::SHL:
6020 return PerformSHLCombine(N, DCI, OptLevel);
6021 case ISD::AND:
6022 return PerformANDCombine(N, DCI);
6023 case ISD::UREM:
6024 case ISD::SREM:
6025 return PerformREMCombine(N, DCI, OptLevel);
6026 case ISD::SETCC:
6027 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6028 case ISD::LOAD:
6029 return PerformLOADCombine(N, DCI);
6030 case NVPTXISD::StoreRetval:
6031 case NVPTXISD::StoreRetvalV2:
6032 case NVPTXISD::StoreRetvalV4:
6033 return PerformStoreRetvalCombine(N);
6034 case NVPTXISD::StoreParam:
6035 case NVPTXISD::StoreParamV2:
6036 case NVPTXISD::StoreParamV4:
6037 return PerformStoreParamCombine(N);
6038 case ISD::EXTRACT_VECTOR_ELT:
6039 return PerformEXTRACTCombine(N, DCI);
6040 case ISD::VSELECT:
6041 return PerformVSELECTCombine(N, DCI);
6042 }
6043 return SDValue();
6044}
6045
6046 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
6047 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
6048 SmallVectorImpl<SDValue> &Results) {
6049 EVT ResVT = N->getValueType(0);
6050 SDLoc DL(N);
6051
6052 assert(ResVT.isVector() && "Vector load must have vector type");
6053
6054 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
6055 // legal. We can (and should) split that into 2 loads of <2 x double> here
6056 // but I'm leaving that as a TODO for now.
6057 assert(ResVT.isSimple() && "Can only handle simple types");
6058 switch (ResVT.getSimpleVT().SimpleTy) {
6059 default:
6060 return;
6061 case MVT::v2i8:
6062 case MVT::v2i16:
6063 case MVT::v2i32:
6064 case MVT::v2i64:
6065 case MVT::v2f16:
6066 case MVT::v2f32:
6067 case MVT::v2f64:
6068 case MVT::v4i8:
6069 case MVT::v4i16:
6070 case MVT::v4i32:
6071 case MVT::v4f16:
6072 case MVT::v4f32:
6073 case MVT::v8f16: // <4 x f16x2>
6074 case MVT::v8bf16: // <4 x bf16x2>
6075 case MVT::v8i16: // <4 x i16x2>
6076 // This is a "native" vector type
6077 break;
6078 }
6079
6080 LoadSDNode *LD = cast<LoadSDNode>(N);
6081
6082 Align Alignment = LD->getAlign();
6083 auto &TD = DAG.getDataLayout();
6084 Align PrefAlign =
6085 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
6086 if (Alignment < PrefAlign) {
6087 // This load is not sufficiently aligned, so bail out and let this vector
6088 // load be scalarized. Note that we may still be able to emit smaller
6089 // vector loads. For example, if we are loading a <4 x float> with an
6090 // alignment of 8, this check will fail but the legalizer will try again
6091 // with 2 x <2 x float>, which will succeed with an alignment of 8.
6092 return;
6093 }
6094
6095 EVT EltVT = ResVT.getVectorElementType();
6096 unsigned NumElts = ResVT.getVectorNumElements();
6097
6098 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
6099 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6100 // loaded type to i16 and propagate the "real" type as the memory type.
6101 bool NeedTrunc = false;
6102 if (EltVT.getSizeInBits() < 16) {
6103 EltVT = MVT::i16;
6104 NeedTrunc = true;
6105 }
6106
6107 unsigned Opcode = 0;
6108 SDVTList LdResVTs;
6109 bool Load16x2 = false;
6110
6111 switch (NumElts) {
6112 default:
6113 return;
6114 case 2:
6115 Opcode = NVPTXISD::LoadV2;
6116 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6117 break;
6118 case 4: {
6119 Opcode = NVPTXISD::LoadV4;
6120 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6121 LdResVTs = DAG.getVTList(ListVTs);
6122 break;
6123 }
6124 case 8: {
6125 // v8f16 is a special case. PTX doesn't have ld.v8.f16
6126 // instruction. Instead, we split the vector into v2f16 chunks and
6127 // load them with ld.v4.b32.
6128 assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
6129 Load16x2 = true;
6130 Opcode = NVPTXISD::LoadV4;
6131 EVT VVT;
6132 switch (EltVT.getSimpleVT().SimpleTy) {
6133 case MVT::f16:
6134 VVT = MVT::v2f16;
6135 break;
6136 case MVT::bf16:
6137 VVT = MVT::v2bf16;
6138 break;
6139 case MVT::i16:
6140 VVT = MVT::v2i16;
6141 break;
6142 default:
6143 llvm_unreachable("Unsupported v8 vector type.");
6144 }
6145 EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
6146 LdResVTs = DAG.getVTList(ListVTs);
6147 break;
6148 }
6149 }
6150
6151 // Copy regular operands
6152 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
6153
6154 // The select routine does not have access to the LoadSDNode instance, so
6155 // pass along the extension information
6156 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
6157
6158 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6159 LD->getMemoryVT(),
6160 LD->getMemOperand());
6161
6162 SmallVector<SDValue, 8> ScalarRes;
6163 if (Load16x2) {
6164 // Split v2f16 subvectors back into individual elements.
6165 NumElts /= 2;
6166 for (unsigned i = 0; i < NumElts; ++i) {
6167 SDValue SubVector = NewLD.getValue(i);
6168 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6169 DAG.getIntPtrConstant(0, DL));
6170 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6171 DAG.getIntPtrConstant(1, DL));
6172 ScalarRes.push_back(E0);
6173 ScalarRes.push_back(E1);
6174 }
6175 } else {
6176 for (unsigned i = 0; i < NumElts; ++i) {
6177 SDValue Res = NewLD.getValue(i);
6178 if (NeedTrunc)
6179 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6180 ScalarRes.push_back(Res);
6181 }
6182 }
6183
6184 SDValue LoadChain = NewLD.getValue(NumElts);
6185
6186 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
6187
6188 Results.push_back(BuildVec);
6189 Results.push_back(LoadChain);
6190}
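// Editorial sketch (not part of the upstream source): e.g. a sufficiently
// aligned v4f32 load becomes
//   {f32, f32, f32, f32, ch} = NVPTXISD::LoadV4 ptr
// followed by a build_vector of the four scalars; for v8f16 the same LoadV4
// returns four v2f16 halves that are split back into eight elements.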
6191
6192 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6193 SmallVectorImpl<SDValue> &Results) {
6194 SDValue Chain = N->getOperand(0);
6195 SDValue Intrin = N->getOperand(1);
6196 SDLoc DL(N);
6197
6198 // Get the intrinsic ID
6199 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6200 switch (IntrinNo) {
6201 default:
6202 return;
6203 case Intrinsic::nvvm_ldg_global_i:
6204 case Intrinsic::nvvm_ldg_global_f:
6205 case Intrinsic::nvvm_ldg_global_p:
6206 case Intrinsic::nvvm_ldu_global_i:
6207 case Intrinsic::nvvm_ldu_global_f:
6208 case Intrinsic::nvvm_ldu_global_p: {
6209 EVT ResVT = N->getValueType(0);
6210
6211 if (ResVT.isVector()) {
6212 // Vector LDG/LDU
6213
6214 unsigned NumElts = ResVT.getVectorNumElements();
6215 EVT EltVT = ResVT.getVectorElementType();
6216
6217 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6218 // legalization.
6219 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6220 // loaded type to i16 and propagate the "real" type as the memory type.
6221 bool NeedTrunc = false;
6222 if (EltVT.getSizeInBits() < 16) {
6223 EltVT = MVT::i16;
6224 NeedTrunc = true;
6225 }
6226
6227 unsigned Opcode = 0;
6228 SDVTList LdResVTs;
6229
6230 switch (NumElts) {
6231 default:
6232 return;
6233 case 2:
6234 switch (IntrinNo) {
6235 default:
6236 return;
6237 case Intrinsic::nvvm_ldg_global_i:
6238 case Intrinsic::nvvm_ldg_global_f:
6239 case Intrinsic::nvvm_ldg_global_p:
6240 Opcode = NVPTXISD::LDGV2;
6241 break;
6242 case Intrinsic::nvvm_ldu_global_i:
6243 case Intrinsic::nvvm_ldu_global_f:
6244 case Intrinsic::nvvm_ldu_global_p:
6245 Opcode = NVPTXISD::LDUV2;
6246 break;
6247 }
6248 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6249 break;
6250 case 4: {
6251 switch (IntrinNo) {
6252 default:
6253 return;
6254 case Intrinsic::nvvm_ldg_global_i:
6255 case Intrinsic::nvvm_ldg_global_f:
6256 case Intrinsic::nvvm_ldg_global_p:
6257 Opcode = NVPTXISD::LDGV4;
6258 break;
6259 case Intrinsic::nvvm_ldu_global_i:
6260 case Intrinsic::nvvm_ldu_global_f:
6261 case Intrinsic::nvvm_ldu_global_p:
6262 Opcode = NVPTXISD::LDUV4;
6263 break;
6264 }
6265 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6266 LdResVTs = DAG.getVTList(ListVTs);
6267 break;
6268 }
6269 }
6270
6271 SmallVector<SDValue, 8> OtherOps;
6272
6273 // Copy regular operands
6274
6275 OtherOps.push_back(Chain); // Chain
6276 // Skip operand 1 (intrinsic ID)
6277 // Others
6278 OtherOps.append(N->op_begin() + 2, N->op_end());
6279
6280 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6281
6282 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6283 MemSD->getMemoryVT(),
6284 MemSD->getMemOperand());
6285
6286 SmallVector<SDValue, 4> ScalarRes;
6287
6288 for (unsigned i = 0; i < NumElts; ++i) {
6289 SDValue Res = NewLD.getValue(i);
6290 if (NeedTrunc)
6291 Res =
6292 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6293 ScalarRes.push_back(Res);
6294 }
6295
6296 SDValue LoadChain = NewLD.getValue(NumElts);
6297
6298 SDValue BuildVec =
6299 DAG.getBuildVector(ResVT, DL, ScalarRes);
6300
6301 Results.push_back(BuildVec);
6302 Results.push_back(LoadChain);
6303 } else {
6304 // i8 LDG/LDU
6305 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6306 "Custom handling of non-i8 ldu/ldg?");
6307
6308 // Just copy all operands as-is
6309 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
6310
6311 // Force output to i16
6312 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6313
6314 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6315
6316 // We make sure the memory type is i8, which will be used during isel
6317 // to select the proper instruction.
6318 SDValue NewLD =
6319 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6320 MVT::i8, MemSD->getMemOperand());
6321
6322 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6323 NewLD.getValue(0)));
6324 Results.push_back(NewLD.getValue(1));
6325 }
6326 }
6327 }
6328}
6329
6330 static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6331 SmallVectorImpl<SDValue> &Results) {
6332 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6333 // result so that it can pass legalization.
6334 SDLoc DL(N);
6335 SDValue Chain = N->getOperand(0);
6336 SDValue Reg = N->getOperand(1);
6337 SDValue Glue = N->getOperand(2);
6338
6339 assert(Reg.getValueType() == MVT::i128 &&
6340 "Custom lowering for CopyFromReg with 128-bit reg only");
6341 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6342 N->getValueType(2)};
6343 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6344
6345 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6346 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6347 {NewValue.getValue(0), NewValue.getValue(1)});
6348
6349 Results.push_back(Pair);
6350 Results.push_back(NewValue.getValue(2));
6351 Results.push_back(NewValue.getValue(3));
6352}
6353
6354 void NVPTXTargetLowering::ReplaceNodeResults(
6355 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6356 switch (N->getOpcode()) {
6357 default:
6358 report_fatal_error("Unhandled custom legalization");
6359 case ISD::LOAD:
6360 ReplaceLoadVector(N, DAG, Results);
6361 return;
6362 case ISD::INTRINSIC_W_CHAIN:
6363 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6364 return;
6365 case ISD::CopyFromReg:
6366 ReplaceCopyFromReg_128(N, DAG, Results);
6367 return;
6368 }
6369}
6370
6371 NVPTXTargetLowering::AtomicExpansionKind
6372 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6373 Type *Ty = AI->getValOperand()->getType();
6374
6375 if (AI->isFloatingPointOperation()) {
6376 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6377 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6378 STI.getPTXVersion() >= 63)
6379 return AtomicExpansionKind::None;
6380 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6381 STI.getPTXVersion() >= 78)
6382 return AtomicExpansionKind::None;
6383 if (Ty->isFloatTy())
6384 return AtomicExpansionKind::None;
6385 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6386 return AtomicExpansionKind::None;
6387 }
6388 return AtomicExpansionKind::CmpXChg;
6389 }
6390
6391 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6392 auto ITy = cast<llvm::IntegerType>(Ty);
6393
6394 switch (AI->getOperation()) {
6395 default:
6396 return AtomicExpansionKind::CmpXChg;
6397 case AtomicRMWInst::BinOp::And:
6398 case AtomicRMWInst::BinOp::Or:
6399 case AtomicRMWInst::BinOp::Xor:
6400 case AtomicRMWInst::BinOp::Xchg:
6401 switch (ITy->getBitWidth()) {
6402 case 8:
6403 case 16:
6404 return AtomicExpansionKind::CmpXChg;
6405 case 32:
6406 return AtomicExpansionKind::None;
6407 case 64:
6408 if (STI.hasAtomBitwise64())
6409 return AtomicExpansionKind::None;
6410 return AtomicExpansionKind::CmpXChg;
6411 default:
6412 llvm_unreachable("unsupported width encountered");
6413 }
6414 case AtomicRMWInst::BinOp::Add:
6415 case AtomicRMWInst::BinOp::Sub:
6416 case AtomicRMWInst::BinOp::Max:
6417 case AtomicRMWInst::BinOp::Min:
6418 case AtomicRMWInst::BinOp::UMax:
6419 case AtomicRMWInst::BinOp::UMin:
6420 switch (ITy->getBitWidth()) {
6421 case 8:
6422 case 16:
6423 return AtomicExpansionKind::CmpXChg;
6424 case 32:
6425 return AtomicExpansionKind::None;
6426 case 64:
6427 if (STI.hasAtomMinMax64())
6428 return AtomicExpansionKind::None;
6429 return AtomicExpansionKind::CmpXChg;
6430 default:
6431 llvm_unreachable("unsupported width encountered");
6432 }
6433 }
6434
6435 return AtomicExpansionKind::CmpXChg;
6436 }
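// Editorial note (not part of the upstream source): as a concrete reading of
// the logic above, `atomicrmw fadd half` on sm_70+ with PTX 6.3+ is left
// native (AtomicExpansionKind::None), while e.g. `atomicrmw xor i16` is
// expanded by the AtomicExpand pass into a cmpxchg loop (CmpXChg).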
6437
6438 // Pin NVPTXTargetObjectFile's vtables to this file.
6439 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6440
6441 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6442 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6443 return getDataSection();
6444}
#define MAKE_CASE(V)
static const LLT F32
amdgpu AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
Module.h This file contains the declarations for the Module class.
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static unsigned getOpcForTextureInstr(unsigned Intrinsic)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
@ PVF_FIRST
@ PVF_SCALAR
@ PVF_INNER
@ PVF_LAST
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
unsigned SmVersion
Definition: NVVMReflect.cpp:81
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(VerifyEach)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:415
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1110
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:412
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1217
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
bool isFloatingPointOperation() const
Definition: Instructions.h:864
BinOp getOperation() const
Definition: Instructions.h:787
Value * getValOperand()
Definition: Instructions.h:856
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition: Attributes.h:805
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:629
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:212
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:36
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
Machine Value Type.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getPTXVersion() const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool allowFP16Math() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
iterator_range< use_iterator > uses()
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node type undefined.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:737
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:568
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:494
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:843
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:488
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:691
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:483
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
Definition: SelectionDAG.h:501
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
Class to represent struct types.
Definition: DerivedTypes.h:216
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a wider type.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
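Both atomic-capability hooks above are constructor-time configuration as well; a sketch with illustrative widths:
setMaxAtomicSizeInBitsSupported(64); // wider atomic ops are expanded by the legalizer
setMinCmpXchgSizeInBits(32);         // narrower cmpxchg is widened before selection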
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
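Promotion pairs a Promote action with the type to perform the operation in; a sketch with an assumed type pair:
setOperationAction(ISD::AND, MVT::v2i16, Promote);
AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); // bitcast and do the AND as i32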
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for by implementing the PerformDAGCombine virtual method.
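A sketch of registering for combine callbacks; the node list is illustrative:
// Each listed generic node is offered to this target's PerformDAGCombine().
setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::AND, ISD::LOAD});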
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate what to do about it.
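Together with setTruncStoreAction above, this controls whether memory operations may convert implicitly; a sketch with an assumed FP type pair:
// Split f32->f64 extending loads into a plain load plus FP_EXTEND, and
// f64->f32 truncating stores into FP_ROUND plus a plain store.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);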
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequences that increase the amount of flow control.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger size, needs to be expanded to some other code sequence, or the target has a custom expander for it.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
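A sketch of a custom store lowering falling back to the generic helper; the alignment threshold is an assumption, and Op and DAG are presumed in scope inside a TargetLowering member:
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
if (ST->getAlign() < Align(4)) // hypothetical hardware requirement
  return expandUnalignedStore(ST, DAG); // emits two or more narrower stores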
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
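A sketch of a target override of getRegForInlineAsmConstraint above, mapping a single-letter constraint onto a register class; MyTargetLowering and Int32RegsRegClass are hypothetical names:
std::pair<unsigned, const TargetRegisterClass *>
MyTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  if (Constraint == "r") // hand "r" operands to a hypothetical GPR class
    return std::make_pair(0U, &MyTarget::Int32RegsRegClass);
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}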
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
@ VoidTyID
type with no size
Definition: Type.h:63
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
Definition: StringSaver.h:52
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:257
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1198
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction, then an offset node that is added/subtracted from the base pointer to form the address (for indexed memory ops).
Definition: ISDOpcodes.h:1074
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:820
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length and element type, this produces a concatenated vector result value, with length equal to the sum of the lengths of the inputs.
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition: ISDOpcodes.h:933
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1231
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:960
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1120
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1095
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1099
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer, a SRCVALUE for the destination, and a SRCVALUE for the source.
Definition: ISDOpcodes.h:1194
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high half of the 2N-bit product.
Definition: ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) based on comparing the two input operands (ops #0 and #1), using the condition code in op #4.
Definition: ISDOpcodes.h:771
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1008
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:366
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1084
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register.
Definition: ISDOpcodes.h:828
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:918
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:765
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:952
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:1027
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:866
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1225
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1251
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:899
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1189
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:793
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified, possibly variable, elements.
Definition: ISDOpcodes.h:529
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDEF.
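A sketch tying BUILD_VECTOR and allOperandsUndef together, assuming DAG, DL, and scalar SDValues A and B are in scope:
SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, {A, B});
if (ISD::allOperandsUndef(Vec.getNode())) // both elements undef?
  Vec = DAG.getUNDEF(MVT::v2f16);         // fold to a single UNDEF node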
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
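A sketch of the range form as DAG code commonly uses it, assuming an SDNode *N is in scope:
bool AllConstant = all_of(N->op_values(), [](SDValue Operand) {
  return isa<ConstantSDNode>(Operand); // true only if every operand is a constant
});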
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool Isv2x16VT(EVT VT)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index of the item in the sequence.
Definition: STLExtras.h:2400
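A sketch pairing elements with their indices; the vector contents are illustrative:
SmallVector<EVT, 4> VTs = {MVT::i32, MVT::f32};
for (auto [Idx, VT] : enumerate(VTs))
  errs() << "operand " << Idx << " is " << VT.getEVTString() << "\n";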
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
@ ADDRESS_SPACE_PARAM
Definition: NVPTXBaseInfo.h:29
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
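For example, with an assumed size of 12 bytes:
uint64_t Padded = PowerOf2Ceil(12); // 16; values already a power of two are unchanged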
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
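For example, with assumed values:
uint64_t Next = alignTo(10, Align(8)); // 16; already-aligned sizes come back unchanged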
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual underlying non-aggregate types that comprise it.
Definition: Analysis.cpp:79
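A sketch of the typical call pattern in argument lowering, assuming TLI, DL, and an IR Type *Ty are in scope:
SmallVector<EVT, 8> ValueVTs;
SmallVector<TypeSize, 8> Offsets;
ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
// ValueVTs now holds one EVT per non-aggregate piece of Ty, and Offsets
// the byte offset of each piece from the start of the object.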
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:276
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environment.
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
This class contains a discriminated union of information about pointers in memory operands, relating them back to the original IR.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)