LLVM 22.0.0git
NVPTXISelLowering.cpp
Go to the documentation of this file.
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
18#include "NVPTXSubtarget.h"
19#include "NVPTXTargetMachine.h"
21#include "NVPTXUtilities.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Argument.h"
39#include "llvm/IR/Attributes.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DataLayout.h"
44#include "llvm/IR/FPEnv.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/IRBuilder.h"
48#include "llvm/IR/Instruction.h"
50#include "llvm/IR/IntrinsicsNVPTX.h"
51#include "llvm/IR/Module.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
65#include <algorithm>
66#include <cassert>
67#include <cmath>
68#include <cstdint>
69#include <iterator>
70#include <optional>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76#define DEBUG_TYPE "nvptx-lower"
77
78using namespace llvm;
79
81 "nvptx-sched4reg",
82 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
83
85 "nvptx-fma-level", cl::Hidden,
86 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
87 " 1: do it 2: do it aggressively"),
88 cl::init(2));
89
91 "nvptx-prec-divf32", cl::Hidden,
93 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
95 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
96 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
98 "Use IEEE Compliant F32 div.rnd if available (default)"),
100 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
102
104 "nvptx-prec-sqrtf32", cl::Hidden,
105 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
106 cl::init(true));
107
108/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
109/// does NOT use lg2.approx for log2, so this is disabled by default.
111 "nvptx-approx-log2f32",
112 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
113 cl::init(false));
114
116 "nvptx-force-min-byval-param-align", cl::Hidden,
117 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
118 " params of device functions."),
119 cl::init(false));
120
123 const SDNode &N) const {
124 // If nvptx-prec-div32=N is used on the command-line, always honor it
125 if (UsePrecDivF32.getNumOccurrences() > 0)
126 return UsePrecDivF32;
127
128 const SDNodeFlags Flags = N.getFlags();
129 if (Flags.hasApproximateFuncs())
131
133}
134
136 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
137 if (UsePrecSqrtF32.getNumOccurrences() > 0)
138 return UsePrecSqrtF32;
139
140 if (N) {
141 const SDNodeFlags Flags = N->getFlags();
142 if (Flags.hasApproximateFuncs())
143 return false;
144 }
145
146 return true;
147}
148
153
154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
190
191// When legalizing vector loads/stores, this function is called, which does two
192// things:
193// 1. Determines Whether the vector is something we want to custom lower,
194// std::nullopt is returned if we do not want to custom lower it.
195// 2. If we do want to handle it, returns two parameters:
196// - unsigned int NumElts - The number of elements in the final vector
197// - EVT EltVT - The type of the elements in the final vector
198static std::optional<std::pair<unsigned int, MVT>>
200 unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229 case MVT::v4i64:
230 case MVT::v4f64:
231 case MVT::v8i32:
232 // This is a "native" vector type iff the address space is global
233 // and the target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
237 case MVT::v2i8:
238 case MVT::v2i32:
239 case MVT::v2i64:
240 case MVT::v2f64:
241 case MVT::v4i32:
242 // This is a "native" vector type
243 return std::pair(NumElts, EltVT);
244 case MVT::v16f16: // <8 x f16x2>
245 case MVT::v16bf16: // <8 x bf16x2>
246 case MVT::v16i16: // <8 x i16x2>
247 case MVT::v32i8: // <8 x i8x4>
248 // This can be upsized into a "native" vector type iff the address space is
249 // global and the target supports 256-bit loads/stores.
250 if (!CanLowerTo256Bit)
251 return std::nullopt;
253 case MVT::v2i16: // <1 x i16x2>
254 case MVT::v2f16: // <1 x f16x2>
255 case MVT::v2bf16: // <1 x bf16x2>
256 case MVT::v4i8: // <1 x i8x4>
257 case MVT::v4i16: // <2 x i16x2>
258 case MVT::v4f16: // <2 x f16x2>
259 case MVT::v4bf16: // <2 x bf16x2>
260 case MVT::v8i8: // <2 x i8x4>
261 case MVT::v8f16: // <4 x f16x2>
262 case MVT::v8bf16: // <4 x bf16x2>
263 case MVT::v8i16: // <4 x i16x2>
264 case MVT::v16i8: // <4 x i8x4>
265 PackRegSize = 32;
266 break;
267 case MVT::v8f32: // <4 x f32x2>
268 if (!CanLowerTo256Bit)
269 return std::nullopt;
271 case MVT::v2f32: // <1 x f32x2>
272 case MVT::v4f32: // <2 x f32x2>
273 if (!STI.hasF32x2Instructions())
274 return std::pair(NumElts, EltVT);
275 PackRegSize = 64;
276 break;
277 }
278
279 // If we reach here, then we can pack 2 or more elements into a single 32-bit
280 // or 64-bit PTX register and treat the vector as a new vector containing
281 // packed elements.
282
283 // Number of elements to pack in one word.
284 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
285
286 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
287}
288
289/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
290/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
291/// the types as required by the calling convention (with special handling for
292/// i8s).
293/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
294/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
295/// LowerCall, and LowerReturn.
296static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
297 LLVMContext &Ctx, CallingConv::ID CallConv,
298 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
300 uint64_t StartingOffset = 0) {
301 SmallVector<EVT, 16> TempVTs;
302 SmallVector<uint64_t, 16> TempOffsets;
303 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
304
305 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
306 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
307 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
308
309 // Since we actually can load/store b8, we need to ensure that we'll use
310 // the original sized type for any i8s or i8 vectors.
311 if (VT.getScalarType() == MVT::i8) {
312 if (RegisterVT == MVT::i16)
313 RegisterVT = MVT::i8;
314 else if (RegisterVT == MVT::v2i16)
315 RegisterVT = MVT::v2i8;
316 else
317 assert(RegisterVT == MVT::v4i8 &&
318 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
319 }
320
321 // TODO: This is horribly incorrect for cases where the vector elements are
322 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
323 // has existed for as long as NVPTX has and no one has complained, so we'll
324 // leave it for now.
325 for (unsigned I : seq(NumRegs)) {
326 ValueVTs.push_back(RegisterVT);
327 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
328 }
329 }
330}
331
332// We return an EVT that can hold N VTs
333// If the VT is a vector, the resulting EVT is a flat vector with the same
334// element type as VT's element type.
335static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
336 if (N == 1)
337 return VT;
338
339 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
340 VT.getVectorNumElements() * N)
341 : EVT::getVectorVT(C, VT, N);
342}
343
345 const SDLoc &dl, SelectionDAG &DAG) {
346 if (V.getValueType() == VT) {
347 assert(I == 0 && "Index must be 0 for scalar value");
348 return V;
349 }
350
351 if (!VT.isVector())
352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
353 DAG.getVectorIdxConstant(I, dl));
354
355 return DAG.getNode(
356 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
358}
359
360template <typename T>
361static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
362 SelectionDAG &DAG, T GetElement) {
363 if (N == 1)
364 return GetElement(0);
365
367 for (const unsigned I : llvm::seq(N)) {
368 SDValue Val = GetElement(I);
369 if (Val.getValueType().isVector())
370 DAG.ExtractVectorElements(Val, Values);
371 else
372 Values.push_back(Val);
373 }
374
375 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
376 Values.size());
377 return DAG.getBuildVector(VT, dl, Values);
378}
379
380/// PromoteScalarIntegerPTX
381/// Used to make sure the arguments/returns are suitable for passing
382/// and promote them to a larger size if they're not.
383///
384/// The promoted type is placed in \p PromoteVT if the function returns true.
386 if (VT.isScalarInteger()) {
387 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
388 default:
390 "Promotion is not suitable for scalars of size larger than 64-bits");
391 case 1:
392 return MVT::i1;
393 case 2:
394 case 4:
395 case 8:
396 return MVT::i8;
397 case 16:
398 return MVT::i16;
399 case 32:
400 return MVT::i32;
401 case 64:
402 return MVT::i64;
403 }
404 }
405 return VT;
406}
407
408// Check whether we can merge loads/stores of some of the pieces of a
409// flattened function parameter or return value into a single vector
410// load/store.
411//
412// The flattened parameter is represented as a list of EVTs and
413// offsets, and the whole structure is aligned to ParamAlignment. This
414// function determines whether we can load/store pieces of the
415// parameter starting at index Idx using a single vectorized op of
416// size AccessSize. If so, it returns the number of param pieces
417// covered by the vector op. Otherwise, it returns 1.
418template <typename T>
420 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
421 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
422
423 // Can't vectorize if param alignment is not sufficient.
424 if (ParamAlignment < AccessSize)
425 return 1;
426 // Can't vectorize if offset is not aligned.
427 if (Offsets[Idx] & (AccessSize - 1))
428 return 1;
429
430 EVT EltVT = ValueVTs[Idx];
431 unsigned EltSize = EltVT.getStoreSize();
432
433 // Element is too large to vectorize.
434 if (EltSize >= AccessSize)
435 return 1;
436
437 unsigned NumElts = AccessSize / EltSize;
438 // Can't vectorize if AccessBytes if not a multiple of EltSize.
439 if (AccessSize != EltSize * NumElts)
440 return 1;
441
442 // We don't have enough elements to vectorize.
443 if (Idx + NumElts > ValueVTs.size())
444 return 1;
445
446 // PTX ISA can only deal with 2- and 4-element vector ops.
447 if (NumElts != 4 && NumElts != 2)
448 return 1;
449
450 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
451 // Types do not match.
452 if (ValueVTs[j] != EltVT)
453 return 1;
454
455 // Elements are not contiguous.
456 if (Offsets[j] - Offsets[j - 1] != EltSize)
457 return 1;
458 }
459 // OK. We can vectorize ValueVTs[i..i+NumElts)
460 return NumElts;
461}
462
463// Computes whether and how we can vectorize the loads/stores of a
464// flattened function parameter or return value.
465//
466// The flattened parameter is represented as the list of ValueVTs and
467// Offsets, and is aligned to ParamAlignment bytes. We return a vector
468// of the same size as ValueVTs indicating how each piece should be
469// loaded/stored (i.e. as a scalar, or as part of a vector
470// load/store).
471template <typename T>
474 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
475 bool IsVAArg = false) {
476 // Set vector size to match ValueVTs and mark all elements as
477 // scalars by default.
478
479 if (IsVAArg)
480 return SmallVector<unsigned>(ValueVTs.size(), 1);
481
482 SmallVector<unsigned, 16> VectorInfo;
483
484 const auto GetNumElts = [&](unsigned I) -> unsigned {
485 for (const unsigned AccessSize : {16, 8, 4, 2}) {
486 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
487 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
488 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
489 "Unexpected vectorization size");
490 if (NumElts != 1)
491 return NumElts;
492 }
493 return 1;
494 };
495
496 // Check what we can vectorize using 128/64/32-bit accesses.
497 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
498 const unsigned NumElts = GetNumElts(I);
499 VectorInfo.push_back(NumElts);
500 I += NumElts;
501 }
502 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
503 ValueVTs.size());
504 return VectorInfo;
505}
506
507// NVPTXTargetLowering Constructor.
509 const NVPTXSubtarget &STI)
510 : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
511 // always lower memset, memcpy, and memmove intrinsics to load/store
512 // instructions, rather
513 // then generating calls to memset, mempcy or memmove.
517
520
521 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
522 // condition branches.
523 setJumpIsExpensive(true);
524
525 // Wide divides are _very_ slow. Try to reduce the width of the divide if
526 // possible.
527 addBypassSlowDiv(64, 32);
528
529 // By default, use the Source scheduling
530 if (sched4reg)
532 else
534
535 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
536 LegalizeAction NoF16Action) {
537 bool IsOpSupported = STI.allowFP16Math();
538 switch (Op) {
539 // Several FP16 instructions are available on sm_80 only.
540 case ISD::FMINNUM:
541 case ISD::FMAXNUM:
542 case ISD::FMAXNUM_IEEE:
543 case ISD::FMINNUM_IEEE:
544 case ISD::FMAXIMUM:
545 case ISD::FMINIMUM:
546 case ISD::FMAXIMUMNUM:
547 case ISD::FMINIMUMNUM:
548 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
549 break;
550 case ISD::FEXP2:
551 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
552 break;
553 }
554 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
555 };
556
557 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
558 LegalizeAction NoBF16Action) {
559 bool IsOpSupported = STI.hasNativeBF16Support(Op);
561 Op, VT, IsOpSupported ? Action : NoBF16Action);
562 };
563
564 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
565 LegalizeAction NoI16x2Action) {
566 bool IsOpSupported = false;
567 // instructions are available on sm_90 only
568 switch (Op) {
569 case ISD::ADD:
570 case ISD::SMAX:
571 case ISD::SMIN:
572 case ISD::UMIN:
573 case ISD::UMAX:
574 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
575 break;
576 }
577 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
578 };
579
580 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
581 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
582 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
583 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
584 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
585 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
586 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
587 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
588 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
589 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
591 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
592
593 if (STI.hasF32x2Instructions())
594 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
595
596 // Conversion to/from FP16/FP16x2 is always legal.
601
602 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
603 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
604 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
605
606 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
607 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
608
609 // Conversion to/from BFP16/BFP16x2 is always legal.
614
615 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
616 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
617 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
618 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
619
620 // Conversion to/from i16/i16x2 is always legal.
625
630
631 // No support for these operations with v2f32.
634 // Need custom lowering in case the index is dynamic.
635 if (STI.hasF32x2Instructions())
637
638 // Custom conversions to/from v2i8.
639 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
640
641 // Only logical ops can be done on v4i8 directly, others must be done
642 // elementwise.
659 MVT::v4i8, Expand);
660
661 // Operations not directly supported by NVPTX.
662 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
663 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
664 MVT::v4i8, MVT::i32, MVT::i64}) {
666 setOperationAction(ISD::BR_CC, VT, Expand);
667 }
668
669 // Not directly supported. TLI would attempt to expand operations like
670 // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes.
672
673 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
674 // For others we will expand to a SHL/SRA pair.
681
688
691
693 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
694 Expand);
695
696 if (STI.hasHWROT32()) {
699 Custom);
700 }
701
703
704 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
705 setOperationAction(ISD::BRIND, MVT::Other, Expand);
706
707 // We want to legalize constant related memmove and memcopy
708 // intrinsics.
710
711 // FP extload/truncstore is not legal in PTX. We need to expand all these.
712 for (auto FloatVTs :
714 for (MVT ValVT : FloatVTs) {
715 for (MVT MemVT : FloatVTs) {
716 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
717 setTruncStoreAction(ValVT, MemVT, Expand);
718 }
719 }
720 }
721
722 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
723 // how they'll be lowered in ISel anyway, and by doing this a little earlier
724 // we allow for more DAG combine opportunities.
725 for (auto IntVTs :
727 for (MVT ValVT : IntVTs)
728 for (MVT MemVT : IntVTs)
729 if (isTypeLegal(ValVT))
730 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
731
732 // PTX does not support load / store predicate registers
733 setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
734 for (MVT VT : MVT::integer_valuetypes()) {
736 Promote);
737 setTruncStoreAction(VT, MVT::i1, Expand);
738 }
739
740 // Disable generations of extload/truncstore for v2i16/v2i8. The generic
741 // expansion for these nodes when they are unaligned is incorrect if the
742 // type is a vector.
743 //
744 // TODO: Fix the generic expansion for these nodes found in
745 // TargetLowering::expandUnalignedLoad/Store.
747 MVT::v2i8, Expand);
748 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
749
750 // Register custom handling for illegal type loads/stores. We'll try to custom
751 // lower almost all illegal types and logic in the lowering will discard cases
752 // we can't handle.
753 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
755 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
756 setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
757
758 // Custom legalization for LDU intrinsics.
759 // TODO: The logic to lower these is not very robust and we should rewrite it.
760 // Perhaps LDU should not be represented as an intrinsic at all.
763 if (IsPTXVectorType(VT))
765
769 MVT::i1, Expand);
770
771 // This is legal in NVPTX
776
777 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
778 setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);
779
780 // TRAP can be lowered to PTX trap
781 setOperationAction(ISD::TRAP, MVT::Other, Legal);
782 // DEBUGTRAP can be lowered to PTX brkpt
783 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
784
785 // Support varargs.
786 setOperationAction(ISD::VASTART, MVT::Other, Custom);
787 setOperationAction(ISD::VAARG, MVT::Other, Custom);
788 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
789 setOperationAction(ISD::VAEND, MVT::Other, Expand);
790
792 {MVT::i16, MVT::i32, MVT::i64}, Legal);
793
795 Promote);
798
799 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
800 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
801 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
802 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
803 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
804 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
805 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
806
807 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
808 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
809 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
810 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
811 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
812 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
813
814 // Other arithmetic and logic ops are unsupported.
818 MVT::v2i16, Expand);
819
824 if (STI.getPTXVersion() >= 43) {
829 }
830
832 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
835
836 // PTX does not directly support SELP of i1, so promote to i32 first
838
839 // PTX cannot multiply two i64s in a single instruction.
842
843 // We have some custom DAG combine patterns for these nodes
846 ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
847 ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
848 ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
850 ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
851 ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
852
853 // setcc for f16x2 and bf16x2 needs special handling to prevent
854 // legalizer's attempt to scalarize it due to v2i1 not being legal.
855 if (STI.allowFP16Math() || STI.hasBF16Math())
857
858 // Vector reduction operations. These may be turned into shuffle or tree
859 // reductions depending on what instructions are available for each type.
861 MVT EltVT = VT.getVectorElementType();
862 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
863 setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
864 ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
865 VT, Custom);
866 }
867 }
868
869 // Promote fp16 arithmetic if fp16 hardware isn't available or the
870 // user passed --nvptx-no-fp16-math. The flag is useful because,
871 // although sm_53+ GPUs have some sort of FP16 support in
872 // hardware, only sm_53 and sm_60 have full implementation. Others
873 // only have token amount of hardware and are likely to run faster
874 // by using fp32 units instead.
875 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
876 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
877 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
878 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
879 // bf16 must be promoted to f32.
880 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
881 if (getOperationAction(Op, MVT::bf16) == Promote)
882 AddPromotedToType(Op, MVT::bf16, MVT::f32);
883 setOperationAction(Op, MVT::v2f32,
884 STI.hasF32x2Instructions() ? Legal : Expand);
885 }
886
887 // On SM80, we select add/mul/sub as fma to avoid promotion to float
888 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
889 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
890 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
892 }
893 }
894 }
895
896 // f16/f16x2 neg was introduced in PTX 60, SM_53.
897 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
898 STI.getPTXVersion() >= 60 &&
899 STI.allowFP16Math();
900 for (const auto &VT : {MVT::f16, MVT::v2f16})
901 setOperationAction(ISD::FNEG, VT,
902 IsFP16FP16x2NegAvailable ? Legal : Expand);
903
904 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
905 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
906 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
907 // (would be) Library functions.
908
909 // These map to conversion instructions for scalar FP types.
910 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
911 ISD::FROUNDEVEN, ISD::FTRUNC}) {
912 setOperationAction(Op, MVT::f16, Legal);
913 setOperationAction(Op, MVT::f32, Legal);
914 setOperationAction(Op, MVT::f64, Legal);
915 setOperationAction(Op, MVT::v2f16, Expand);
916 setOperationAction(Op, MVT::v2bf16, Expand);
917 setOperationAction(Op, MVT::v2f32, Expand);
918 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
919 if (getOperationAction(Op, MVT::bf16) == Promote)
920 AddPromotedToType(Op, MVT::bf16, MVT::f32);
921 }
922
923 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
924 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
925 }
926 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
927 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
928 setOperationAction(ISD::FP_EXTEND, VT, Custom);
930 }
931 }
932
933 // Expand v2f32 = fp_extend
934 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
935 // Expand v2[b]f16 = fp_round v2f32
936 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
937
938 // sm_80 only has conversions between f32 and bf16. Custom lower all other
939 // bf16 conversions.
940 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
941 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
944 VT, Custom);
945 }
948 MVT::bf16, Custom);
949 }
950
951 setOperationAction(ISD::FROUND, MVT::f16, Promote);
952 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
953 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
954 setOperationAction(ISD::FROUND, MVT::f32, Custom);
955 setOperationAction(ISD::FROUND, MVT::f64, Custom);
956 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
957 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
958
959 // 'Expand' implements FCOPYSIGN without calling an external library.
966
967 // These map to corresponding instructions for f32/f64. f16 must be
968 // promoted to f32. v2f16 is expanded to f16, which is then promoted
969 // to f32.
970 for (const auto &Op :
971 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
972 setOperationAction(Op, MVT::f16, Promote);
973 setOperationAction(Op, MVT::f32, Legal);
974 // only div/rem/sqrt are legal for f64
975 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
976 setOperationAction(Op, MVT::f64, Legal);
977 }
978 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
979 setOperationAction(Op, MVT::bf16, Promote);
980 AddPromotedToType(Op, MVT::bf16, MVT::f32);
981 }
982 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
983
984 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
985 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
986 if (STI.getPTXVersion() >= 65) {
987 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
988 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
989 } else {
990 setOperationAction(ISD::FABS, MVT::f16, Promote);
991 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
992 }
993 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
994 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
995 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
996 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
997
998 for (const auto &Op :
999 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
1000 setOperationAction(Op, MVT::f32, Legal);
1001 setOperationAction(Op, MVT::f64, Legal);
1002 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1003 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1004 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1005 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1006 if (getOperationAction(Op, MVT::bf16) == Promote)
1007 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1008 setOperationAction(Op, MVT::v2f32, Expand);
1009 }
1010 bool SupportsF32MinMaxNaN =
1011 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1012 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1013 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1014 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1015 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1016 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1017 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1018 setOperationAction(Op, MVT::v2f32, Expand);
1019 }
1020
1021 // Custom lowering for inline asm with 128-bit operands
1024
1025 // FEXP2 support:
1026 // - f32
1027 // - f16/f16x2 (sm_70+, PTX 7.0+)
1028 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1029 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1030 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1031 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1032 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1033 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1034 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1035 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1036
1037 // FLOG2 supports f32 only
1038 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1039 if (UseApproxLog2F32) {
1040 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1041 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1042 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1043 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1044 Expand);
1045 }
1046
1047 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1048
1049 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1050
1051 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1052 // type, we need to custom lower it.
1053 setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
1054 Custom);
1055
1056 // Now deduce the information based on the above mentioned
1057 // actions
1058 computeRegisterProperties(STI.getRegisterInfo());
1059
1060 // PTX support for 16-bit CAS is emulated. Only use 32+
1061 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1062 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1064
1065 // Custom lowering for tcgen05.ld vector operands
1067 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1068 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1069 Custom);
1070
1071 // Custom lowering for tcgen05.st vector operands
1073 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1074 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1075 Custom);
1076
1077 // Enable custom lowering for the following:
1078 // * MVT::i128 - clusterlaunchcontrol
1079 // * MVT::i32 - prmt
1080 // * MVT::Other - internal.addrspace.wrap
1081 setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
1082 Custom);
1083}
1084
1085const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1086
1087#define MAKE_CASE(V) \
1088 case V: \
1089 return #V;
1090
1091 switch ((NVPTXISD::NodeType)Opcode) {
1093 break;
1094
1137 }
1138 return nullptr;
1139
1140#undef MAKE_CASE
1141}
1142
1145 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1146 VT.getScalarType() == MVT::i1)
1147 return TypeSplitVector;
1149}
1150
1152 int Enabled, int &ExtraSteps,
1153 bool &UseOneConst,
1154 bool Reciprocal) const {
1157 return SDValue();
1158
1159 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1160 ExtraSteps = 0;
1161
1162 SDLoc DL(Operand);
1163 EVT VT = Operand.getValueType();
1164 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1165
1166 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1167 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1168 DAG.getConstant(IID, DL, MVT::i32), Operand);
1169 };
1170
1171 // The sqrt and rsqrt refinement processes assume we always start out with an
1172 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1173 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1174 // any refinement, we must return a regular sqrt.
1175 if (Reciprocal || ExtraSteps > 0) {
1176 if (VT == MVT::f32)
1177 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1178 : Intrinsic::nvvm_rsqrt_approx_f);
1179 else if (VT == MVT::f64)
1180 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1181 else
1182 return SDValue();
1183 } else {
1184 if (VT == MVT::f32)
1185 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1186 : Intrinsic::nvvm_sqrt_approx_f);
1187 else {
1188 // There's no sqrt.approx.f64 instruction, so we emit
1189 // reciprocal(rsqrt(x)). This is faster than
1190 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1191 // x * rsqrt(x).)
1192 return DAG.getNode(
1194 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1195 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1196 }
1197 }
1198}
1199
1201 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1203 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1204 unsigned UniqueCallSite) const {
1205 auto PtrVT = getPointerTy(DL);
1206
1207 std::string Prototype;
1208 raw_string_ostream O(Prototype);
1209 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1210
1211 if (RetTy->isVoidTy()) {
1212 O << "()";
1213 } else {
1214 O << "(";
1215 if (shouldPassAsArray(RetTy)) {
1216 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1217 O << ".param .align " << RetAlign.value() << " .b8 _["
1218 << DL.getTypeAllocSize(RetTy) << "]";
1219 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1220 unsigned size = 0;
1221 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1222 size = ITy->getBitWidth();
1223 } else {
1224 assert(RetTy->isFloatingPointTy() &&
1225 "Floating point type expected here");
1226 size = RetTy->getPrimitiveSizeInBits();
1227 }
1228 // PTX ABI requires all scalar return values to be at least 32
1229 // bits in size. fp16 normally uses .b16 as its storage type in
1230 // PTX, so its size must be adjusted here, too.
1232
1233 O << ".param .b" << size << " _";
1234 } else if (isa<PointerType>(RetTy)) {
1235 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1236 } else {
1237 llvm_unreachable("Unknown return type");
1238 }
1239 O << ") ";
1240 }
1241 O << "_ (";
1242
1243 bool first = true;
1244
1245 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1246 auto AllOuts = ArrayRef(Outs);
1247 for (const unsigned I : llvm::seq(NumArgs)) {
1248 const auto ArgOuts =
1249 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1250 AllOuts = AllOuts.drop_front(ArgOuts.size());
1251
1252 Type *Ty = Args[I].Ty;
1253 if (!first) {
1254 O << ", ";
1255 }
1256 first = false;
1257
1258 if (ArgOuts[0].Flags.isByVal()) {
1259 // Indirect calls need strict ABI alignment so we disable optimizations by
1260 // not providing a function to optimize.
1261 Type *ETy = Args[I].IndirectType;
1262 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1263 Align ParamByValAlign =
1264 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1265
1266 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1267 << ArgOuts[0].Flags.getByValSize() << "]";
1268 } else {
1269 if (shouldPassAsArray(Ty)) {
1270 Align ParamAlign =
1271 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1272 O << ".param .align " << ParamAlign.value() << " .b8 _["
1273 << DL.getTypeAllocSize(Ty) << "]";
1274 continue;
1275 }
1276 // i8 types in IR will be i16 types in SDAG
1277 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1278 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1279 "type mismatch between callee prototype and arguments");
1280 // scalar type
1281 unsigned sz = 0;
1282 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1283 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1284 } else if (isa<PointerType>(Ty)) {
1285 sz = PtrVT.getSizeInBits();
1286 } else {
1287 sz = Ty->getPrimitiveSizeInBits();
1288 }
1289 O << ".param .b" << sz << " _";
1290 }
1291 }
1292
1293 if (FirstVAArg)
1294 O << (first ? "" : ",") << " .param .align "
1295 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1296 O << ")";
1297 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1298 O << " .noreturn";
1299 O << ";";
1300
1301 return Prototype;
1302}
1303
1305 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1306 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1307}
1308
1309Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1310 unsigned Idx,
1311 const DataLayout &DL) const {
1312 if (!CB) {
1313 // CallSite is zero, fallback to ABI type alignment
1314 return DL.getABITypeAlign(Ty);
1315 }
1316
1317 const Function *DirectCallee = CB->getCalledFunction();
1318
1319 if (!DirectCallee) {
1320 // We don't have a direct function symbol, but that may be because of
1321 // constant cast instructions in the call.
1322
1323 // With bitcast'd call targets, the instruction will be the call
1324 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1325 // Check if we have call alignment metadata
1326 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1327 return StackAlign.value();
1328 }
1329 DirectCallee = getMaybeBitcastedCallee(CB);
1330 }
1331
1332 // Check for function alignment information if we found that the
1333 // ultimate target is a Function
1334 if (DirectCallee)
1335 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1336
1337 // Call is indirect, fall back to the ABI type alignment
1338 return DL.getABITypeAlign(Ty);
1339}
1340
1342 const GlobalAddressSDNode *Func) {
1343 if (!Func)
1344 return false;
1345 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1346 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1347 return false;
1348}
1349
1351 const DataLayout &DL,
1352 const TargetLowering &TL) {
1353 if (Ptr->getOpcode() == ISD::FrameIndex) {
1354 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1357
1359 }
1360
1361 // Peel of an addrspacecast to generic and load directly from the specific
1362 // address space.
1363 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1364 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1365 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1366 Ptr = ASC->getOperand(0);
1367 return MachinePointerInfo(ASC->getSrcAddressSpace());
1368 }
1369 }
1370
1371 return MachinePointerInfo();
1372}
1373
1375 if (Flags.isSExt())
1376 return ISD::SIGN_EXTEND;
1377 if (Flags.isZExt())
1378 return ISD::ZERO_EXTEND;
1379 return ISD::ANY_EXTEND;
1380}
1381
1383 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1384 SDLoc dl) {
1385 const EVT ActualVT = V.getValueType();
1386 assert((ActualVT == ExpectedVT ||
1387 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1388 "Non-integer argument type size mismatch");
1389 if (ExpectedVT.bitsGT(ActualVT))
1390 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1391 if (ExpectedVT.bitsLT(ActualVT))
1392 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1393
1394 return V;
1395}
1396
1398 SmallVectorImpl<SDValue> &InVals) const {
1399
1400 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1402 "Support for variadic functions (unsized array parameter) introduced "
1403 "in PTX ISA version 6.0 and requires target sm_30.");
1404
1405 SelectionDAG &DAG = CLI.DAG;
1406 SDLoc dl = CLI.DL;
1407 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1408 SDValue Callee = CLI.Callee;
1409 ArgListTy &Args = CLI.getArgs();
1410 Type *RetTy = CLI.RetTy;
1411 const CallBase *CB = CLI.CB;
1412 const DataLayout &DL = DAG.getDataLayout();
1413 LLVMContext &Ctx = *DAG.getContext();
1414
1415 const auto GetI32 = [&](const unsigned I) {
1416 return DAG.getConstant(I, dl, MVT::i32);
1417 };
1418
1419 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1420 const SDValue CallChain = CLI.Chain;
1421 const SDValue StartChain =
1422 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1423 SDValue DeclareGlue = StartChain.getValue(1);
1424
1425 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1426
1427 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1428 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1429 // loaded/stored using i16, so it's handled here as well.
1430 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1431 SDValue Declare =
1432 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1433 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1434 CallPrereqs.push_back(Declare);
1435 DeclareGlue = Declare.getValue(1);
1436 return Declare;
1437 };
1438
1439 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1440 unsigned Size) {
1441 SDValue Declare = DAG.getNode(
1442 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1443 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1444 CallPrereqs.push_back(Declare);
1445 DeclareGlue = Declare.getValue(1);
1446 return Declare;
1447 };
1448
1449 // Variadic arguments.
1450 //
1451 // Normally, for each argument, we declare a param scalar or a param
1452 // byte array in the .param space, and store the argument value to that
1453 // param scalar or array starting at offset 0.
1454 //
1455 // In the case of the first variadic argument, we declare a vararg byte array
1456 // with size 0. The exact size of this array isn't known at this point, so
1457 // it'll be patched later. All the variadic arguments will be stored to this
1458 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1459 // initially set to 0, so it can be used for non-variadic arguments (which use
1460 // 0 offset) to simplify the code.
1461 //
1462 // After all vararg is processed, 'VAOffset' holds the size of the
1463 // vararg byte array.
1464 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1465 "Non-VarArg function with extra arguments");
1466
1467 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1468 unsigned VAOffset = 0; // current offset in the param array
1469
1470 const SDValue VADeclareParam =
1471 CLI.Args.size() > FirstVAArg
1472 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1473 Align(STI.getMaxRequiredAlignment()), 0)
1474 : SDValue();
1475
1476 // Args.size() and Outs.size() need not match.
1477 // Outs.size() will be larger
1478 // * if there is an aggregate argument with multiple fields (each field
1479 // showing up separately in Outs)
1480 // * if there is a vector argument with more than typical vector-length
1481 // elements (generally if more than 4) where each vector element is
1482 // individually present in Outs.
1483 // So a different index should be used for indexing into Outs/OutVals.
1484 // See similar issue in LowerFormalArguments.
1485 auto AllOuts = ArrayRef(CLI.Outs);
1486 auto AllOutVals = ArrayRef(CLI.OutVals);
1487 assert(AllOuts.size() == AllOutVals.size() &&
1488 "Outs and OutVals must be the same size");
1489 // Declare the .params or .reg need to pass values
1490 // to the function
1491 for (const auto E : llvm::enumerate(Args)) {
1492 const auto ArgI = E.index();
1493 const auto Arg = E.value();
1494 const auto ArgOuts =
1495 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1496 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1497 AllOuts = AllOuts.drop_front(ArgOuts.size());
1498 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1499
1500 const bool IsVAArg = (ArgI >= FirstVAArg);
1501 const bool IsByVal = Arg.IsByVal;
1502
1503 const SDValue ParamSymbol =
1504 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1505
1506 assert((!IsByVal || Arg.IndirectType) &&
1507 "byval arg must have indirect type");
1508 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1509
1510 const Align ArgAlign = [&]() {
1511 if (IsByVal) {
1512 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1513 // so we don't need to worry whether it's naturally aligned or not.
1514 // See TargetLowering::LowerCallTo().
1515 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1517 InitialAlign, DL);
1518 }
1519 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1520 }();
1521
1522 const unsigned TySize = DL.getTypeAllocSize(ETy);
1523 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1524 "type size mismatch");
1525
1526 const SDValue ArgDeclare = [&]() {
1527 if (IsVAArg)
1528 return VADeclareParam;
1529
1530 if (IsByVal || shouldPassAsArray(Arg.Ty))
1531 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1532
1533 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1534 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1535 "Only int and float types are supported as non-array arguments");
1536
1537 return MakeDeclareScalarParam(ParamSymbol, TySize);
1538 }();
1539
1540 if (IsByVal) {
1541 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1542 SDValue SrcPtr = ArgOutVals[0];
1543 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1544 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1545
1546 if (IsVAArg)
1547 VAOffset = alignTo(VAOffset, ArgAlign);
1548
1549 SmallVector<EVT, 4> ValueVTs, MemVTs;
1551 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1552
1553 unsigned J = 0;
1554 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1555 for (const unsigned NumElts : VI) {
1556 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1557 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1558 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1559 SDValue SrcLoad =
1560 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1561
1562 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1563 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1564 SDValue ParamAddr =
1565 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1566 SDValue StoreParam =
1567 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1569 CallPrereqs.push_back(StoreParam);
1570
1571 J += NumElts;
1572 }
1573 if (IsVAArg)
1574 VAOffset += TySize;
1575 } else {
1578 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1579 VAOffset);
1580 assert(VTs.size() == Offsets.size() && "Size mismatch");
1581 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1582
1583 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1584 // than 32-bits are sign extended or zero extended, depending on
1585 // whether they are signed or unsigned types. This case applies
1586 // only to scalar parameters and not to aggregate values.
1587 const bool ExtendIntegerParam =
1588 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1589
1590 const auto GetStoredValue = [&](const unsigned I) {
1591 SDValue StVal = ArgOutVals[I];
1593 StVal.getValueType() &&
1594 "OutVal type should always be legal");
1595
1596 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1597 const EVT StoreVT =
1598 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1599
1600 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1601 };
1602
1603 unsigned J = 0;
1604 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1605 for (const unsigned NumElts : VI) {
1606 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1607
1608 unsigned Offset;
1609 if (IsVAArg) {
1610 // TODO: We may need to support vector types that can be passed
1611 // as scalars in variadic arguments.
1612 assert(NumElts == 1 &&
1613 "Vectorization should be disabled for vaargs.");
1614
1615 // Align each part of the variadic argument to their type.
1616 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1617 Offset = VAOffset;
1618
1619 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1620 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1621 } else {
1622 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1623 Offset = Offsets[J];
1624 }
1625
1626 SDValue Ptr =
1627 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1628
1629 const MaybeAlign CurrentAlign = ExtendIntegerParam
1630 ? MaybeAlign(std::nullopt)
1631 : commonAlignment(ArgAlign, Offset);
1632
1633 SDValue Val =
1634 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1635 return GetStoredValue(J + K);
1636 });
1637
1638 SDValue StoreParam =
1639 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1641 CallPrereqs.push_back(StoreParam);
1642
1643 J += NumElts;
1644 }
1645 }
1646 }
1647
1648 // Handle Result
1649 if (!Ins.empty()) {
1650 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1651 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1652 if (shouldPassAsArray(RetTy)) {
1653 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1654 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1655 } else {
1656 MakeDeclareScalarParam(RetSymbol, ResultSize);
1657 }
1658 }
1659
1660 // Set the size of the vararg param byte array if the callee is a variadic
1661 // function and the variadic part is not empty.
1662 if (VADeclareParam) {
1663 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1664 VADeclareParam.getOperand(1),
1665 VADeclareParam.getOperand(2), GetI32(VAOffset),
1666 VADeclareParam.getOperand(4)};
1667 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1668 VADeclareParam->getVTList(), DeclareParamOps);
1669 }
1670
1671 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1672 // If the type of the callsite does not match that of the function, convert
1673 // the callsite to an indirect call.
1674 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1675
1676 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1677 // between them we must rely on the call site value which is valid for
1678 // indirect calls but is always null for libcalls.
1679 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1680
1681 if (isa<ExternalSymbolSDNode>(Callee)) {
1682 Function* CalleeFunc = nullptr;
1683
1684 // Try to find the callee in the current module.
1685 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1686 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1687
1688 // Set the "libcall callee" attribute to indicate that the function
1689 // must always have a declaration.
1690 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1691 }
1692
1693 if (IsIndirectCall) {
1694 // This is indirect function call case : PTX requires a prototype of the
1695 // form
1696 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1697 // to be emitted, and the label has to used as the last arg of call
1698 // instruction.
1699 // The prototype is embedded in a string and put as the operand for a
1700 // CallPrototype SDNode which will print out to the value of the string.
1701 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1702 std::string Proto =
1703 getPrototype(DL, RetTy, Args, CLI.Outs,
1704 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1705 UniqueCallSite);
1706 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1707 const SDValue PrototypeDeclare = DAG.getNode(
1708 NVPTXISD::CallPrototype, dl, MVT::Other,
1709 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1710 CallPrereqs.push_back(PrototypeDeclare);
1711 }
1712
1713 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1714 const unsigned NumArgs =
1715 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1716 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1717 /// NumParams, Callee, Proto)
1718 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1719 const SDValue Call = DAG.getNode(
1720 NVPTXISD::CALL, dl, MVT::Other,
1721 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1722 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1723
1724 SmallVector<SDValue, 16> LoadChains{Call};
1725 SmallVector<SDValue, 16> ProxyRegOps;
1726 if (!Ins.empty()) {
1729 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1730 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1731
1732 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1733 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1734
1735 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1736 // 32-bits are sign extended or zero extended, depending on whether
1737 // they are signed or unsigned types.
1738 const bool ExtendIntegerRetVal =
1739 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1740
1741 unsigned I = 0;
1742 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1743 for (const unsigned NumElts : VI) {
1744 const MaybeAlign CurrentAlign =
1745 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1746 : commonAlignment(RetAlign, Offsets[I]);
1747
1748 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1749 const EVT LoadVT =
1750 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1751 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1752 SDValue Ptr =
1753 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1754
1755 SDValue R =
1756 DAG.getLoad(VecVT, dl, Call, Ptr,
1758
1759 LoadChains.push_back(R.getValue(1));
1760 for (const unsigned J : llvm::seq(NumElts))
1761 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1762 I += NumElts;
1763 }
1764 }
1765
1766 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1767 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1768 UniqueCallSite + 1, SDValue(), dl);
1769
1770 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1771 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1772 // dangling.
1773 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1774 SDValue Proxy =
1775 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1776 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1777 InVals.push_back(Ret);
1778 }
1779
1780 // set IsTailCall to false for now, until we figure out how to express
1781 // tail call optimization in PTX
1782 CLI.IsTailCall = false;
1783 return CallEnd;
1784}
1785
1787 SelectionDAG &DAG) const {
1788
1789 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1790 const Function &Fn = DAG.getMachineFunction().getFunction();
1791
1793 Fn,
1794 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1795 "requires target sm_52.",
1796 SDLoc(Op).getDebugLoc()));
1797 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1798 Op.getOperand(0)};
1799 return DAG.getMergeValues(Ops, SDLoc());
1800 }
1801
1802 SDLoc DL(Op.getNode());
1803 SDValue Chain = Op.getOperand(0);
1804 SDValue Size = Op.getOperand(1);
1805 uint64_t Align = Op.getConstantOperandVal(2);
1806
1807 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1808 // the default stack alignment should be used.
1809 if (Align == 0)
1811
1812 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
1813 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1814
1815 SDValue Alloc =
1816 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1817 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1818 DAG.getTargetConstant(Align, DL, MVT::i32)});
1819
1820 SDValue ASC = DAG.getAddrSpaceCast(
1822
1823 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1824}
1825
1827 SelectionDAG &DAG) const {
1828 SDLoc DL(Op.getNode());
1829 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1830 const Function &Fn = DAG.getMachineFunction().getFunction();
1831
1833 Fn,
1834 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1835 ">= sm_52.",
1836 DL.getDebugLoc()));
1837 return Op.getOperand(0);
1838 }
1839
1840 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1841 SDValue Chain = Op.getOperand(0);
1842 SDValue Ptr = Op.getOperand(1);
1845 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1846}
1847
1849 SelectionDAG &DAG) const {
1850 SDLoc DL(Op.getNode());
1851 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1852 const Function &Fn = DAG.getMachineFunction().getFunction();
1853
1855 Fn,
1856 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1857 "sm_52.",
1858 DL.getDebugLoc()));
1859 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1860 return DAG.getMergeValues(Ops, DL);
1861 }
1862
1863 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1864 SDValue Chain = Op.getOperand(0);
1865 SDValue SS =
1866 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1867 SDValue ASC = DAG.getAddrSpaceCast(
1868 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1869 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1870}
1871
1872// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1873// (see LegalizeDAG.cpp). This is slow and uses local memory.
1874// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
1875SDValue
1876NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1877 SDNode *Node = Op.getNode();
1878 SDLoc dl(Node);
1880 unsigned NumOperands = Node->getNumOperands();
1881 for (unsigned i = 0; i < NumOperands; ++i) {
1882 SDValue SubOp = Node->getOperand(i);
1883 EVT VVT = SubOp.getNode()->getValueType(0);
1884 EVT EltVT = VVT.getVectorElementType();
1885 unsigned NumSubElem = VVT.getVectorNumElements();
1886 for (unsigned j = 0; j < NumSubElem; ++j) {
1887 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1888 DAG.getIntPtrConstant(j, dl)));
1889 }
1890 }
1891 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1892}
1893
1895 SelectionDAG &DAG,
1896 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1897 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1898 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1899 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1900 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1901}
1902
1904 SelectionDAG &DAG,
1905 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1906 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1907}
1908
1909/// Reduces the elements using the scalar operations provided. The operations
1910/// are sorted descending in number of inputs they take. The flags on the
1911/// original reduction operation will be propagated to each scalar operation.
1912/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1913/// used in ExpandReductions and SelectionDAG.
1915 const SmallVector<SDValue> &Elements, EVT EltTy,
1916 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1917 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1918 // Build the reduction tree at each level, starting with all the elements.
1919 SmallVector<SDValue> Level = Elements;
1920
1921 unsigned OpIdx = 0;
1922 while (Level.size() > 1) {
1923 // Try to reduce this level using the current operator.
1924 const auto [Op, NumInputs] = Ops[OpIdx];
1925
1926 // Build the next level by partially reducing all elements.
1927 SmallVector<SDValue> ReducedLevel;
1928 unsigned I = 0, E = Level.size();
1929 for (; I + NumInputs <= E; I += NumInputs) {
1930 // Reduce elements in groups of [NumInputs], as much as possible.
1931 ReducedLevel.push_back(DAG.getNode(
1932 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1933 }
1934
1935 if (I < E) {
1936 // Handle leftover elements.
1937
1938 if (ReducedLevel.empty()) {
1939 // We didn't reduce anything at this level. We need to pick a smaller
1940 // operator.
1941 ++OpIdx;
1942 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1943 continue;
1944 }
1945
1946 // We reduced some things but there's still more left, meaning the
1947 // operator's number of inputs doesn't evenly divide this level size. Move
1948 // these elements to the next level.
1949 for (; I < E; ++I)
1950 ReducedLevel.push_back(Level[I]);
1951 }
1952
1953 // Process the next level.
1954 Level = ReducedLevel;
1955 }
1956
1957 return *Level.begin();
1958}
1959
1960// Get scalar reduction opcode
1961static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1962 switch (ReductionOpcode) {
1963 case ISD::VECREDUCE_FMAX:
1964 return ISD::FMAXNUM;
1965 case ISD::VECREDUCE_FMIN:
1966 return ISD::FMINNUM;
1967 case ISD::VECREDUCE_FMAXIMUM:
1968 return ISD::FMAXIMUM;
1969 case ISD::VECREDUCE_FMINIMUM:
1970 return ISD::FMINIMUM;
1971 default:
1972 llvm_unreachable("unhandled reduction opcode");
1973 }
1974}
1975
1976/// Get 3-input scalar reduction opcode
1977static std::optional<NVPTXISD::NodeType>
1978getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
1979 switch (ReductionOpcode) {
1980 case ISD::VECREDUCE_FMAX:
1981 return NVPTXISD::FMAXNUM3;
1982 case ISD::VECREDUCE_FMIN:
1983 return NVPTXISD::FMINNUM3;
1984 case ISD::VECREDUCE_FMAXIMUM:
1985 return NVPTXISD::FMAXIMUM3;
1986 case ISD::VECREDUCE_FMINIMUM:
1987 return NVPTXISD::FMINIMUM3;
1988 default:
1989 return std::nullopt;
1990 }
1991}
1992
1993/// Lower reductions to either a sequence of operations or a tree if
1994/// reassociations are allowed. This method will use larger operations like
1995/// max3/min3 when the target supports them.
1996SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
1997 SelectionDAG &DAG) const {
1998 SDLoc DL(Op);
1999 const SDNodeFlags Flags = Op->getFlags();
2000 SDValue Vector = Op.getOperand(0);
2001
2002 const unsigned Opcode = Op->getOpcode();
2003 const EVT EltTy = Vector.getValueType().getVectorElementType();
2004
2005 // Whether we can use 3-input min/max when expanding the reduction.
2006 const bool CanUseMinMax3 =
2007 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2008 STI.getPTXVersion() >= 88 &&
2009 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2010 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2011
2012 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2013 // number of inputs they take.
2014 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2015
2016 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2017 CanUseMinMax3 && Opcode3Elem)
2018 ScalarOps.push_back({*Opcode3Elem, 3});
2019 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2020
2022 DAG.ExtractVectorElements(Vector, Elements);
2023
2024 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2025}
2026
2027SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2028 // Handle bitcasting from v2i8 without hitting the default promotion
2029 // strategy which goes through stack memory.
2030 EVT FromVT = Op->getOperand(0)->getValueType(0);
2031 if (FromVT != MVT::v2i8) {
2032 return Op;
2033 }
2034
2035 // Pack vector elements into i16 and bitcast to final type
2036 SDLoc DL(Op);
2037 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2038 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2039 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2040 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2041 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2042 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2043 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2044 SDValue AsInt = DAG.getNode(
2045 ISD::OR, DL, MVT::i16,
2046 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2047 EVT ToVT = Op->getValueType(0);
2048 return DAG.getBitcast(ToVT, AsInt);
2049}
2050
2051// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2052// would get lowered as two constant loads and vector-packing move.
2053// Instead we want just a constant move:
2054// mov.b32 %r2, 0x40003C00
2055SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2056 SelectionDAG &DAG) const {
2057 EVT VT = Op->getValueType(0);
2058 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2059 return Op;
2060 SDLoc DL(Op);
2061
2062 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2063 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2064 isa<ConstantFPSDNode>(Operand);
2065 })) {
2066 if (VT != MVT::v4i8)
2067 return Op;
2068 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2069 // to optimize calculation of constant parts.
2070 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2071 uint64_t SelectionValue) -> SDValue {
2072 SDValue L = Left;
2073 SDValue R = Right;
2074 if (Cast) {
2075 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2076 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2077 }
2078 return getPRMT(L, R, SelectionValue, DL, DAG);
2079 };
2080 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2081 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2082 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2083 return DAG.getBitcast(VT, PRMT3210);
2084 }
2085
2086 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2087 auto GetOperand = [](SDValue Op, int N) -> APInt {
2088 const SDValue &Operand = Op->getOperand(N);
2089 EVT VT = Op->getValueType(0);
2090 if (Operand->isUndef())
2091 return APInt(32, 0);
2092 APInt Value;
2093 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2094 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2095 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2096 Value = Operand->getAsAPIntVal();
2097 else
2098 llvm_unreachable("Unsupported type");
2099 // i8 values are carried around as i16, so we need to zero out upper bits,
2100 // so they do not get in the way of combining individual byte values
2101 if (VT == MVT::v4i8)
2102 Value = Value.trunc(8);
2103 return Value.zext(32);
2104 };
2105
2106 // Construct a 32-bit constant by shifting into place smaller values
2107 // (elements of the vector type VT).
2108 // For example, if VT has 2 elements, then N == 2:
2109 // ShiftAmount = 32 / N = 16
2110 // Value |= Op0 (b16) << 0
2111 // Value |= Op1 (b16) << 16
2112 // If N == 4:
2113 // ShiftAmount = 32 / N = 8
2114 // Value |= Op0 (b8) << 0
2115 // Value |= Op1 (b8) << 8
2116 // Value |= Op2 (b8) << 16
2117 // Value |= Op3 (b8) << 24
2118 // ...etc
2119 APInt Value(32, 0);
2120 const unsigned NumElements = VT.getVectorNumElements();
2121 assert(32 % NumElements == 0 && "must evenly divide bit length");
2122 const unsigned ShiftAmount = 32 / NumElements;
2123 for (unsigned ElementNo : seq(NumElements))
2124 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2125 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2126 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2127}
2128
2129SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2130 SelectionDAG &DAG) const {
2131 SDValue Index = Op->getOperand(1);
2132 SDValue Vector = Op->getOperand(0);
2133 SDLoc DL(Op);
2134 EVT VectorVT = Vector.getValueType();
2135
2136 if (VectorVT == MVT::v4i8) {
2137 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2138 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2139 DAG.getConstant(0x7770, DL, MVT::i32));
2140 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2141 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2142 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2143 SDNodeFlags Flags;
2144 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2145 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2146 Ext->setFlags(Flags);
2147 return Ext;
2148 }
2149
2150 // Constant index will be matched by tablegen.
2151 if (isa<ConstantSDNode>(Index.getNode()))
2152 return Op;
2153
2154 // Extract individual elements and select one of them.
2155 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2156 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2157 EVT EltVT = VectorVT.getVectorElementType();
2158
2159 SDLoc dl(Op.getNode());
2160 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2161 DAG.getIntPtrConstant(0, dl));
2162 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2163 DAG.getIntPtrConstant(1, dl));
2164 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2166}
2167
2168SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2169 SelectionDAG &DAG) const {
2170 SDValue Vector = Op->getOperand(0);
2171 EVT VectorVT = Vector.getValueType();
2172
2173 if (VectorVT != MVT::v4i8)
2174 return Op;
2175 SDLoc DL(Op);
2176 SDValue Value = Op->getOperand(1);
2177 if (Value->isUndef())
2178 return Vector;
2179
2180 SDValue Index = Op->getOperand(2);
2181
2182 SDValue BFI =
2183 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2184 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2185 DAG.getNode(ISD::MUL, DL, MVT::i32,
2186 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2187 DAG.getConstant(8, DL, MVT::i32)),
2188 DAG.getConstant(8, DL, MVT::i32)});
2189 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2190}
2191
2192SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2193 SelectionDAG &DAG) const {
2194 SDValue V1 = Op.getOperand(0);
2195 EVT VectorVT = V1.getValueType();
2196 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2197 return Op;
2198
2199 // Lower shuffle to PRMT instruction.
2200 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2201 SDValue V2 = Op.getOperand(1);
2202 uint32_t Selector = 0;
2203 for (auto I : llvm::enumerate(SVN->getMask())) {
2204 if (I.value() != -1) // -1 is a placeholder for undef.
2205 Selector |= (I.value() << (I.index() * 4));
2206 }
2207
2208 SDLoc DL(Op);
2209 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2210 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2211 return DAG.getBitcast(Op.getValueType(), PRMT);
2212}
2213/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2214/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2215/// amount, or
2216/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2217/// amount.
2218SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2219 SelectionDAG &DAG) const {
2220 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2221 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2222
2223 EVT VT = Op.getValueType();
2224 unsigned VTBits = VT.getSizeInBits();
2225 SDLoc dl(Op);
2226 SDValue ShOpLo = Op.getOperand(0);
2227 SDValue ShOpHi = Op.getOperand(1);
2228 SDValue ShAmt = Op.getOperand(2);
2229 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2230
2231 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2232 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2233 // {dHi, dLo} = {aHi, aLo} >> Amt
2234 // dHi = aHi >> Amt
2235 // dLo = shf.r.clamp aLo, aHi, Amt
2236
2237 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2238 SDValue Lo =
2239 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2240
2241 SDValue Ops[2] = { Lo, Hi };
2242 return DAG.getMergeValues(Ops, dl);
2243 }
2244 else {
2245 // {dHi, dLo} = {aHi, aLo} >> Amt
2246 // - if (Amt>=size) then
2247 // dLo = aHi >> (Amt-size)
2248 // dHi = aHi >> Amt (this is either all 0 or all 1)
2249 // else
2250 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2251 // dHi = aHi >> Amt
2252
2253 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2254 DAG.getConstant(VTBits, dl, MVT::i32),
2255 ShAmt);
2256 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2257 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2258 DAG.getConstant(VTBits, dl, MVT::i32));
2259 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2260 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2261 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2262
2263 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2264 DAG.getConstant(VTBits, dl, MVT::i32),
2265 ISD::SETGE);
2266 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2267 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2268
2269 SDValue Ops[2] = { Lo, Hi };
2270 return DAG.getMergeValues(Ops, dl);
2271 }
2272}
2273
2274/// LowerShiftLeftParts - Lower SHL_PARTS, which
2275/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2276/// amount, or
2277/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2278/// amount.
2279SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2280 SelectionDAG &DAG) const {
2281 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2282 assert(Op.getOpcode() == ISD::SHL_PARTS);
2283
2284 EVT VT = Op.getValueType();
2285 unsigned VTBits = VT.getSizeInBits();
2286 SDLoc dl(Op);
2287 SDValue ShOpLo = Op.getOperand(0);
2288 SDValue ShOpHi = Op.getOperand(1);
2289 SDValue ShAmt = Op.getOperand(2);
2290
2291 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2292 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2293 // {dHi, dLo} = {aHi, aLo} << Amt
2294 // dHi = shf.l.clamp aLo, aHi, Amt
2295 // dLo = aLo << Amt
2296
2297 SDValue Hi =
2298 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2299 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2300
2301 SDValue Ops[2] = { Lo, Hi };
2302 return DAG.getMergeValues(Ops, dl);
2303 }
2304 else {
2305 // {dHi, dLo} = {aHi, aLo} << Amt
2306 // - if (Amt>=size) then
2307 // dLo = aLo << Amt (all 0)
2308 // dLo = aLo << (Amt-size)
2309 // else
2310 // dLo = aLo << Amt
2311 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2312
2313 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2314 DAG.getConstant(VTBits, dl, MVT::i32),
2315 ShAmt);
2316 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2317 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2318 DAG.getConstant(VTBits, dl, MVT::i32));
2319 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2320 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2321 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2322
2323 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2324 DAG.getConstant(VTBits, dl, MVT::i32),
2325 ISD::SETGE);
2326 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2327 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2328
2329 SDValue Ops[2] = { Lo, Hi };
2330 return DAG.getMergeValues(Ops, dl);
2331 }
2332}
2333
2334/// If the types match, convert the generic copysign to the NVPTXISD version,
2335/// otherwise bail ensuring that mismatched cases are properly expaned.
2336SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2337 SelectionDAG &DAG) const {
2338 EVT VT = Op.getValueType();
2339 SDLoc DL(Op);
2340
2341 SDValue In1 = Op.getOperand(0);
2342 SDValue In2 = Op.getOperand(1);
2343 EVT SrcVT = In2.getValueType();
2344
2345 if (!SrcVT.bitsEq(VT))
2346 return SDValue();
2347
2348 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2349}
2350
2351SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2352 EVT VT = Op.getValueType();
2353
2354 if (VT == MVT::f32)
2355 return LowerFROUND32(Op, DAG);
2356
2357 if (VT == MVT::f64)
2358 return LowerFROUND64(Op, DAG);
2359
2360 llvm_unreachable("unhandled type");
2361}
2362
2363// This is the the rounding method used in CUDA libdevice in C like code:
2364// float roundf(float A)
2365// {
2366// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2367// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2368// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2369// }
2370SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2371 SelectionDAG &DAG) const {
2372 SDLoc SL(Op);
2373 SDValue A = Op.getOperand(0);
2374 EVT VT = Op.getValueType();
2375
2376 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2377
2378 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2379 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2380 const unsigned SignBitMask = 0x80000000;
2381 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2382 DAG.getConstant(SignBitMask, SL, MVT::i32));
2383 const unsigned PointFiveInBits = 0x3F000000;
2384 SDValue PointFiveWithSignRaw =
2385 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2386 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2387 SDValue PointFiveWithSign =
2388 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2389 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2390 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2391
2392 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2393 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2394 SDValue IsLarge =
2395 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2396 ISD::SETOGT);
2397 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2398
2399 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2400 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2401 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2402 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2403 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2404}
2405
2406// The implementation of round(double) is similar to that of round(float) in
2407// that they both separate the value range into three regions and use a method
2408// specific to the region to round the values. However, round(double) first
2409// calculates the round of the absolute value and then adds the sign back while
2410// round(float) directly rounds the value with sign.
2411SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2412 SelectionDAG &DAG) const {
2413 SDLoc SL(Op);
2414 SDValue A = Op.getOperand(0);
2415 EVT VT = Op.getValueType();
2416
2417 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2418
2419 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2420 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2421 DAG.getConstantFP(0.5, SL, VT));
2422 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2423
2424 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2425 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2426 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2427 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2428 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2429 DAG.getConstantFP(0, SL, VT),
2430 RoundedA);
2431
2432 // Add sign to rounded_A
2433 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2434 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2435
2436 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2437 SDValue IsLarge =
2438 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2439 ISD::SETOGT);
2440 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2441}
2442
2444 EVT VT = N->getValueType(0);
2445 EVT NVT = MVT::f32;
2446 if (VT.isVector()) {
2447 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2448 }
2449 SDLoc DL(N);
2450 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2451 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2452 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2453 return DAG.getFPExtendOrRound(Res, DL, VT);
2454}
2455
2456SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2457 SelectionDAG &DAG) const {
2458 if (useF32FTZ(DAG.getMachineFunction())) {
2459 return PromoteBinOpToF32(Op.getNode(), DAG);
2460 }
2461 return Op;
2462}
2463
2464SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2465 SelectionDAG &DAG) const {
2466 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2467
2468 if (Op.getValueType() == MVT::bf16) {
2469 SDLoc Loc(Op);
2470 return DAG.getNode(
2471 ISD::FP_ROUND, Loc, MVT::bf16,
2472 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2473 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2474 }
2475
2476 // Everything else is considered legal.
2477 return Op;
2478}
2479
2480SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2481 SelectionDAG &DAG) const {
2482 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2483
2484 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2485 SDLoc Loc(Op);
2486 return DAG.getNode(
2487 Op.getOpcode(), Loc, Op.getValueType(),
2488 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2489 }
2490
2491 // Everything else is considered legal.
2492 return Op;
2493}
2494
2495SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2496 SelectionDAG &DAG) const {
2497 EVT NarrowVT = Op.getValueType();
2498 SDValue Wide = Op.getOperand(0);
2499 EVT WideVT = Wide.getValueType();
2500 if (NarrowVT.getScalarType() == MVT::bf16) {
2501 const TargetLowering *TLI = STI.getTargetLowering();
2502 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2503 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2504 }
2505 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2506 // This combination was the first to support f32 -> bf16.
2507 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2508 if (WideVT.getScalarType() == MVT::f32) {
2509 return Op;
2510 }
2511 if (WideVT.getScalarType() == MVT::f64) {
2512 SDLoc Loc(Op);
2513 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2514 // the hardware f32 -> bf16 instruction.
2516 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2517 : MVT::f32,
2518 Wide, Loc, DAG);
2519 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2520 }
2521 }
2522 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2523 }
2524 }
2525
2526 // Everything else is considered legal.
2527 return Op;
2528}
2529
2530SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2531 SelectionDAG &DAG) const {
2532 SDValue Narrow = Op.getOperand(0);
2533 EVT NarrowVT = Narrow.getValueType();
2534 EVT WideVT = Op.getValueType();
2535 if (NarrowVT.getScalarType() == MVT::bf16) {
2536 if (WideVT.getScalarType() == MVT::f32 &&
2537 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2538 SDLoc Loc(Op);
2539 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2540 }
2541 if (WideVT.getScalarType() == MVT::f64 &&
2542 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2543 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2544 : MVT::f32;
2545 SDLoc Loc(Op);
2546 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2547 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2548 } else {
2549 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2550 }
2551 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2552 }
2553 }
2554
2555 // Everything else is considered legal.
2556 return Op;
2557}
2558
2560 SDLoc DL(Op);
2561 if (Op.getValueType() != MVT::v2i16)
2562 return Op;
2563 EVT EltVT = Op.getValueType().getVectorElementType();
2564 SmallVector<SDValue> VecElements;
2565 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2566 SmallVector<SDValue> ScalarArgs;
2567 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2568 [&](const SDUse &O) {
2569 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2570 O.get(), DAG.getIntPtrConstant(I, DL));
2571 });
2572 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2573 }
2574 SDValue V =
2575 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2576 return V;
2577}
2578
2580 SDNode *N = Op.getNode();
2581 SDLoc DL(N);
2583
2584 // split the vector argument
2585 for (size_t I = 0; I < N->getNumOperands(); I++) {
2586 SDValue Val = N->getOperand(I);
2587 EVT ValVT = Val.getValueType();
2588 if (ValVT.isVector()) {
2589 EVT EltVT = ValVT.getVectorElementType();
2590 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2591 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2592 DAG.getIntPtrConstant(J, DL)));
2593 } else
2594 Ops.push_back(Val);
2595 }
2596
2598 SDValue Tcgen05StNode =
2599 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2600 MemSD->getMemoryVT(), MemSD->getMemOperand());
2601
2602 return Tcgen05StNode;
2603}
2604
2606 SDNode *N = Op.getNode();
2607 SDValue Intrin = N->getOperand(1);
2608
2609 // Get the intrinsic ID
2610 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2611 switch (IntrinNo) {
2612 default:
2613 break;
2614 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2615 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2616 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2617 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2618 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2619 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2620 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2621 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2622 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2623 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2624 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2625 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2626 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2627 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2628 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2629 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2630 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2631 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2632 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2633 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2634 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2635 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2636 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2637 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2638 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2639 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2640 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2641 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2642 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2643 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2644 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2645 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2646 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2647 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2648 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2649 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2650 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2651 return LowerTcgen05St(Op, DAG);
2652 }
2653 return Op;
2654}
2655
2657 SelectionDAG &DAG) {
2658
2659 SDNode *N = Op.getNode();
2660 if (N->getOperand(1).getValueType() != MVT::i128) {
2661 // return, if the operand is already lowered
2662 return SDValue();
2663 }
2664
2665 unsigned IID =
2666 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2667 auto Opcode = [&]() {
2668 switch (IID) {
2669 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2671 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2673 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2675 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2677 default:
2678 llvm_unreachable("unsupported/unhandled intrinsic");
2679 }
2680 }();
2681
2682 SDLoc DL(N);
2683 SDValue TryCancelResponse = N->getOperand(1);
2684 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2685 SDValue TryCancelResponse0 =
2686 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2687 DAG.getIntPtrConstant(0, DL));
2688 SDValue TryCancelResponse1 =
2689 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2690 DAG.getIntPtrConstant(1, DL));
2691
2692 return DAG.getNode(Opcode, DL, N->getVTList(),
2693 {TryCancelResponse0, TryCancelResponse1});
2694}
2695
2697 const unsigned Mode = [&]() {
2698 switch (Op->getConstantOperandVal(0)) {
2699 case Intrinsic::nvvm_prmt:
2701 case Intrinsic::nvvm_prmt_b4e:
2703 case Intrinsic::nvvm_prmt_ecl:
2705 case Intrinsic::nvvm_prmt_ecr:
2707 case Intrinsic::nvvm_prmt_f4e:
2709 case Intrinsic::nvvm_prmt_rc16:
2711 case Intrinsic::nvvm_prmt_rc8:
2713 default:
2714 llvm_unreachable("unsupported/unhandled intrinsic");
2715 }
2716 }();
2717 SDLoc DL(Op);
2718 SDValue A = Op->getOperand(1);
2719 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2720 : DAG.getConstant(0, DL, MVT::i32);
2721 SDValue Selector = (Op->op_end() - 1)->get();
2722 return getPRMT(A, B, Selector, DL, DAG, Mode);
2723}
2725 switch (Op->getConstantOperandVal(0)) {
2726 default:
2727 return Op;
2728 case Intrinsic::nvvm_prmt:
2729 case Intrinsic::nvvm_prmt_b4e:
2730 case Intrinsic::nvvm_prmt_ecl:
2731 case Intrinsic::nvvm_prmt_ecr:
2732 case Intrinsic::nvvm_prmt_f4e:
2733 case Intrinsic::nvvm_prmt_rc16:
2734 case Intrinsic::nvvm_prmt_rc8:
2735 return lowerPrmtIntrinsic(Op, DAG);
2736 case Intrinsic::nvvm_internal_addrspace_wrap:
2737 return Op.getOperand(1);
2738 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2739 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2740 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2741 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2743 }
2744}
2745
2746// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2747// Lower these into a node returning the correct type which is zero-extended
2748// back to the correct size.
2750 SDValue V = Op->getOperand(0);
2751 assert(V.getValueType() == MVT::i64 &&
2752 "Unexpected CTLZ/CTPOP type to legalize");
2753
2754 SDLoc DL(Op);
2755 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2756 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2757}
2758
2760 unsigned Opcode, SelectionDAG &DAG) {
2761 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
2762
2763 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
2764 if (!AmtConst)
2765 return SDValue();
2766 const auto Amt = AmtConst->getZExtValue() & 63;
2767
2768 SDValue UnpackA =
2769 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
2770 SDValue UnpackB =
2771 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
2772
2773 // Arch is Little endiain: 0 = low bits, 1 = high bits
2774 SDValue ALo = UnpackA.getValue(0);
2775 SDValue AHi = UnpackA.getValue(1);
2776 SDValue BLo = UnpackB.getValue(0);
2777 SDValue BHi = UnpackB.getValue(1);
2778
2779 // The bitfeild consists of { AHi : ALo : BHi : BLo }
2780 //
2781 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
2782 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
2783 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
2784 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
2785 //
2786 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
2787 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
2788 // on the direction. Amt = 32 can be implemented by a packing and unpacking
2789 // move to select and arrange the 32bit values. For simplicity, these cases
2790 // are not handled here explicitly and instead we rely on DAGCombiner to
2791 // remove the no-op funnel shifts we insert.
2792 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
2793 ? std::make_tuple(AHi, ALo, BHi)
2794 : std::make_tuple(ALo, BHi, BLo);
2795
2796 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
2797 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
2798 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
2799
2800 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
2801}
2802
2804 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
2805 SDLoc(Op), Op->getOpcode(), DAG);
2806}
2807
2809 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
2810 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
2811 SDLoc(Op), Opcode, DAG);
2812}
2813
2815 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
2816 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
2817 // the semantics of LLVM's frem.
2818 SDLoc DL(Op);
2819 SDValue X = Op->getOperand(0);
2820 SDValue Y = Op->getOperand(1);
2821 EVT Ty = Op.getValueType();
2822 SDNodeFlags Flags = Op->getFlags();
2823
2824 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
2825 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
2826 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
2828 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
2830
2831 if (Flags.hasNoInfs())
2832 return Sub;
2833
2834 // If Y is infinite, return X
2835 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
2836 SDValue Inf =
2837 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
2838 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
2839 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
2840}
2841
2843 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2844
2845 SDValue Cond = Op->getOperand(0);
2846 SDValue TrueVal = Op->getOperand(1);
2847 SDValue FalseVal = Op->getOperand(2);
2848 SDLoc DL(Op);
2849
2850 // If both operands are truncated, we push the select through the truncates.
2851 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
2852 FalseVal.getOpcode() == ISD::TRUNCATE) {
2853 TrueVal = TrueVal.getOperand(0);
2854 FalseVal = FalseVal.getOperand(0);
2855
2856 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
2857 ? TrueVal.getValueType()
2858 : FalseVal.getValueType();
2859 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
2860 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
2861 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
2862 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2863 }
2864
2865 // Otherwise, expand the select into a series of logical operations. These
2866 // often can be folded into other operations either by us or ptxas.
2867 TrueVal = DAG.getFreeze(TrueVal);
2868 FalseVal = DAG.getFreeze(FalseVal);
2869 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
2870 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
2871 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
2872 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
2873 return Or;
2874}
2875
2876SDValue
2878 switch (Op.getOpcode()) {
2879 case ISD::RETURNADDR:
2880 return SDValue();
2881 case ISD::FRAMEADDR:
2882 return SDValue();
2883 case ISD::ADDRSPACECAST:
2884 return LowerADDRSPACECAST(Op, DAG);
2886 return Op;
2888 return lowerIntrinsicWOChain(Op, DAG);
2890 return LowerIntrinsicVoid(Op, DAG);
2891 case ISD::BUILD_VECTOR:
2892 return LowerBUILD_VECTOR(Op, DAG);
2893 case ISD::BITCAST:
2894 return LowerBITCAST(Op, DAG);
2896 return Op;
2898 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2900 return LowerINSERT_VECTOR_ELT(Op, DAG);
2902 return LowerVECTOR_SHUFFLE(Op, DAG);
2904 return LowerCONCAT_VECTORS(Op, DAG);
2905 case ISD::VECREDUCE_FMAX:
2906 case ISD::VECREDUCE_FMIN:
2907 case ISD::VECREDUCE_FMAXIMUM:
2908 case ISD::VECREDUCE_FMINIMUM:
2909 return LowerVECREDUCE(Op, DAG);
2910 case ISD::STORE:
2911 return LowerSTORE(Op, DAG);
2912 case ISD::LOAD:
2913 return LowerLOAD(Op, DAG);
2914 case ISD::SHL_PARTS:
2915 return LowerShiftLeftParts(Op, DAG);
2916 case ISD::SRA_PARTS:
2917 case ISD::SRL_PARTS:
2918 return LowerShiftRightParts(Op, DAG);
2919 case ISD::SELECT:
2920 return lowerSELECT(Op, DAG);
2921 case ISD::FROUND:
2922 return LowerFROUND(Op, DAG);
2923 case ISD::FCOPYSIGN:
2924 return LowerFCOPYSIGN(Op, DAG);
2925 case ISD::SINT_TO_FP:
2926 case ISD::UINT_TO_FP:
2927 return LowerINT_TO_FP(Op, DAG);
2928 case ISD::FP_TO_SINT:
2929 case ISD::FP_TO_UINT:
2930 return LowerFP_TO_INT(Op, DAG);
2931 case ISD::FP_ROUND:
2932 return LowerFP_ROUND(Op, DAG);
2933 case ISD::FP_EXTEND:
2934 return LowerFP_EXTEND(Op, DAG);
2935 case ISD::BR_JT:
2936 return LowerBR_JT(Op, DAG);
2937 case ISD::VAARG:
2938 return LowerVAARG(Op, DAG);
2939 case ISD::VASTART:
2940 return LowerVASTART(Op, DAG);
2941 case ISD::FSHL:
2942 case ISD::FSHR:
2943 return lowerFSH(Op, DAG);
2944 case ISD::ROTL:
2945 case ISD::ROTR:
2946 return lowerROT(Op, DAG);
2947 case ISD::ABS:
2948 case ISD::SMIN:
2949 case ISD::SMAX:
2950 case ISD::UMIN:
2951 case ISD::UMAX:
2952 case ISD::ADD:
2953 case ISD::SUB:
2954 case ISD::MUL:
2955 case ISD::SHL:
2956 case ISD::SREM:
2957 case ISD::UREM:
2958 return LowerVectorArith(Op, DAG);
2959 case ISD::DYNAMIC_STACKALLOC:
2960 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2961 case ISD::STACKRESTORE:
2962 return LowerSTACKRESTORE(Op, DAG);
2963 case ISD::STACKSAVE:
2964 return LowerSTACKSAVE(Op, DAG);
2965 case ISD::CopyToReg:
2966 return LowerCopyToReg_128(Op, DAG);
2967 case ISD::FADD:
2968 case ISD::FSUB:
2969 case ISD::FMUL:
2970 // Used only for bf16 on SM80, where we select fma for non-ftz operation
2971 return PromoteBinOpIfF32FTZ(Op, DAG);
2972 case ISD::CTPOP:
2973 case ISD::CTLZ:
2974 return lowerCTLZCTPOP(Op, DAG);
2975 case ISD::FREM:
2976 return lowerFREM(Op, DAG);
2977
2978 default:
2979 llvm_unreachable("Custom lowering not defined for operation");
2980 }
2981}
2982
2983SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
2984 SDLoc DL(Op);
2985 SDValue Chain = Op.getOperand(0);
2986 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
2987 SDValue Index = Op.getOperand(2);
2988
2989 unsigned JId = JT->getIndex();
2991 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2992
2993 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
2994
2995 // Generate BrxStart node
2996 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2997 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
2998
2999 // Generate BrxItem nodes
3000 assert(!MBBs.empty());
3001 for (MachineBasicBlock *MBB : MBBs.drop_back())
3002 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
3003 DAG.getBasicBlock(MBB), Chain.getValue(1));
3004
3005 // Generate BrxEnd nodes
3006 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3007 IdV, Chain.getValue(1)};
3008 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3009
3010 return BrxEnd;
3011}
3012
3013// This will prevent AsmPrinter from trying to print the jump tables itself.
3017
3018SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3019 SelectionDAG &DAG) const {
3021 unsigned SrcAS = N->getSrcAddressSpace();
3022 unsigned DestAS = N->getDestAddressSpace();
3023 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3024 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3025 // Shared and SharedCluster can be converted to each other through generic
3026 // space
3027 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3030 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3031 SDLoc DL(Op.getNode());
3032 const MVT GenerictVT =
3034 SDValue GenericConversion = DAG.getAddrSpaceCast(
3035 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3036 SDValue SharedClusterConversion =
3037 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3038 ADDRESS_SPACE_GENERIC, DestAS);
3039 return SharedClusterConversion;
3040 }
3041
3042 return DAG.getUNDEF(Op.getValueType());
3043 }
3044
3045 return Op;
3046}
3047
3048// This function is almost a copy of SelectionDAG::expandVAArg().
3049// The only diff is that this one produces loads from local address space.
3050SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3051 const TargetLowering *TLI = STI.getTargetLowering();
3052 SDLoc DL(Op);
3053
3054 SDNode *Node = Op.getNode();
3055 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3056 EVT VT = Node->getValueType(0);
3057 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3058 SDValue Tmp1 = Node->getOperand(0);
3059 SDValue Tmp2 = Node->getOperand(1);
3060 const MaybeAlign MA(Node->getConstantOperandVal(3));
3061
3062 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3063 Tmp1, Tmp2, MachinePointerInfo(V));
3064 SDValue VAList = VAListLoad;
3065
3066 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3067 VAList = DAG.getNode(
3068 ISD::ADD, DL, VAList.getValueType(), VAList,
3069 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3070
3071 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3072 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3073 VAList.getValueType()));
3074 }
3075
3076 // Increment the pointer, VAList, to the next vaarg
3077 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3079 DL, VAList.getValueType()));
3080
3081 // Store the incremented VAList to the legalized pointer
3082 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3083 MachinePointerInfo(V));
3084
3085 const Value *SrcV = Constant::getNullValue(
3087
3088 // Load the actual argument out of the pointer VAList
3089 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3090}
3091
3092SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3093 const TargetLowering *TLI = STI.getTargetLowering();
3094 SDLoc DL(Op);
3095 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3096
3097 // Store the address of unsized array <function>_vararg[] in the ap object.
3098 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3099
3100 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3101 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3102 MachinePointerInfo(SV));
3103}
3104
3105/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3106static std::optional<std::pair<SDValue, SDValue>>
3109 const EVT ResVT = LD->getValueType(0);
3110 const EVT MemVT = LD->getMemoryVT();
3111
3112 // If we're doing sign/zero extension as part of the load, avoid lowering to
3113 // a LoadV node. TODO: consider relaxing this restriction.
3114 if (ResVT != MemVT)
3115 return std::nullopt;
3116
3117 const auto NumEltsAndEltVT =
3118 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3119 if (!NumEltsAndEltVT)
3120 return std::nullopt;
3121 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3122
3123 Align Alignment = LD->getAlign();
3124 const auto &TD = DAG.getDataLayout();
3125 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3126 if (Alignment < PrefAlign) {
3127 // This load is not sufficiently aligned, so bail out and let this vector
3128 // load be scalarized. Note that we may still be able to emit smaller
3129 // vector loads. For example, if we are loading a <4 x float> with an
3130 // alignment of 8, this check will fail but the legalizer will try again
3131 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3132 return std::nullopt;
3133 }
3134
3135 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3136 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3137 // loaded type to i16 and propagate the "real" type as the memory type.
3138 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3139
3140 unsigned Opcode;
3141 switch (NumElts) {
3142 default:
3143 return std::nullopt;
3144 case 2:
3145 Opcode = NVPTXISD::LoadV2;
3146 break;
3147 case 4:
3148 Opcode = NVPTXISD::LoadV4;
3149 break;
3150 case 8:
3151 Opcode = NVPTXISD::LoadV8;
3152 break;
3153 }
3154 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3155 ListVTs.push_back(MVT::Other);
3156 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3157
3158 SDLoc DL(LD);
3159
3160 // Copy regular operands
3161 SmallVector<SDValue, 8> OtherOps(LD->ops());
3162
3163 // The select routine does not have access to the LoadSDNode instance, so
3164 // pass along the extension information
3165 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3166
3167 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3168 LD->getMemOperand());
3169
3170 SmallVector<SDValue> ScalarRes;
3171 if (EltVT.isVector()) {
3173 assert(NumElts * EltVT.getVectorNumElements() ==
3174 ResVT.getVectorNumElements());
3175 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3176 // into individual elements.
3177 for (const unsigned I : llvm::seq(NumElts)) {
3178 SDValue SubVector = NewLD.getValue(I);
3179 DAG.ExtractVectorElements(SubVector, ScalarRes);
3180 }
3181 } else {
3182 for (const unsigned I : llvm::seq(NumElts)) {
3183 SDValue Res = NewLD.getValue(I);
3184 if (LoadEltVT != EltVT)
3185 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3186 ScalarRes.push_back(Res);
3187 }
3188 }
3189
3190 SDValue LoadChain = NewLD.getValue(NumElts);
3191
3192 const MVT BuildVecVT =
3193 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3194 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3195 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3196
3197 return {{LoadValue, LoadChain}};
3198}
3199
3202 const NVPTXSubtarget &STI) {
3203 if (auto Res = replaceLoadVector(N, DAG, STI))
3204 Results.append({Res->first, Res->second});
3205}
3206
3208 const NVPTXSubtarget &STI) {
3209 if (auto Res = replaceLoadVector(N, DAG, STI))
3210 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3211 return SDValue();
3212}
3213
3214// v = ld i1* addr
3215// =>
3216// v1 = ld i8* addr (-> i16)
3217// v = trunc i16 to i1
3219 SDLoc dl(LD);
3220 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3221 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3222 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3223 LD->getBasePtr(), LD->getPointerInfo(),
3224 MVT::i8, LD->getAlign(),
3225 LD->getMemOperand()->getFlags());
3226 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3227 // The legalizer (the caller) is expecting two values from the legalized
3228 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3229 // in LegalizeDAG.cpp which also uses MergeValues.
3230 return DAG.getMergeValues({result, LD->getChain()}, dl);
3231}
3232
3233SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3234 LoadSDNode *LD = cast<LoadSDNode>(Op);
3235
3236 if (Op.getValueType() == MVT::i1)
3237 return lowerLOADi1(LD, DAG);
3238
3239 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3240 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3241 // we allow for more DAG combine opportunities.
3242 if (LD->getExtensionType() == ISD::EXTLOAD) {
3243 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3244 "Unexpected fpext-load");
3245 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3246 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3247 LD->getMemOperand());
3248 }
3249
3250 llvm_unreachable("Unexpected custom lowering for load");
3251}
3252
3254 const NVPTXSubtarget &STI) {
3255 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3256 SDValue Val = N->getOperand(1);
3257 SDLoc DL(N);
3258 const EVT ValVT = Val.getValueType();
3259 const EVT MemVT = N->getMemoryVT();
3260
3261 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3262 // TODO: consider relaxing this restriction.
3263 if (ValVT != MemVT)
3264 return SDValue();
3265
3266 const auto NumEltsAndEltVT =
3267 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3268 if (!NumEltsAndEltVT)
3269 return SDValue();
3270 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3271
3272 const DataLayout &TD = DAG.getDataLayout();
3273
3274 Align Alignment = N->getAlign();
3275 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3276 if (Alignment < PrefAlign) {
3277 // This store is not sufficiently aligned, so bail out and let this vector
3278 // store be scalarized. Note that we may still be able to emit smaller
3279 // vector stores. For example, if we are storing a <4 x float> with an
3280 // alignment of 8, this check will fail but the legalizer will try again
3281 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3282 return SDValue();
3283 }
3284
3285 unsigned Opcode;
3286 switch (NumElts) {
3287 default:
3288 return SDValue();
3289 case 2:
3290 Opcode = NVPTXISD::StoreV2;
3291 break;
3292 case 4:
3293 Opcode = NVPTXISD::StoreV4;
3294 break;
3295 case 8:
3296 Opcode = NVPTXISD::StoreV8;
3297 break;
3298 }
3299
3301
3302 // First is the chain
3303 Ops.push_back(N->getOperand(0));
3304
3305 // Then the split values
3306 if (EltVT.isVector()) {
3308 assert(NumElts * EltVT.getVectorNumElements() ==
3309 ValVT.getVectorNumElements());
3310 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3311 // stored as b32s
3312 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3313 for (const unsigned I : llvm::seq(NumElts)) {
3314 SmallVector<SDValue, 4> SubVectorElts;
3315 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3316 NumEltsPerSubVector);
3317 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3318 }
3319 } else {
3320 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3321 for (const unsigned I : llvm::seq(NumElts)) {
3322 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3323 DAG.getIntPtrConstant(I, DL));
3324
3325 // Since StoreV2 is a target node, we cannot rely on DAG type
3326 // legalization. Therefore, we must ensure the type is legal. For i1 and
3327 // i8, we set the stored type to i16 and propagate the "real" type as the
3328 // memory type.
3329 if (EltVT.getSizeInBits() < 16)
3330 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3331 Ops.push_back(ExtVal);
3332 }
3333 }
3334
3335 // Then any remaining arguments
3336 Ops.append(N->op_begin() + 2, N->op_end());
3337
3338 SDValue NewSt =
3339 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3340 N->getMemoryVT(), N->getMemOperand());
3341
3342 // return DCI.CombineTo(N, NewSt, true);
3343 return NewSt;
3344}
3345
3346SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3347 StoreSDNode *Store = cast<StoreSDNode>(Op);
3348 EVT VT = Store->getMemoryVT();
3349
3350 if (VT == MVT::i1)
3351 return LowerSTOREi1(Op, DAG);
3352
3353 // Lower store of any other vector type, including v2f32 as we want to break
3354 // it apart since this is not a widely-supported type.
3355 return lowerSTOREVector(Op, DAG, STI);
3356}
3357
3358// st i1 v, addr
3359// =>
3360// v1 = zxt v to i16
3361// st.u8 i16, addr
3362SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3363 SDNode *Node = Op.getNode();
3364 SDLoc dl(Node);
3365 StoreSDNode *ST = cast<StoreSDNode>(Node);
3366 SDValue Tmp1 = ST->getChain();
3367 SDValue Tmp2 = ST->getBasePtr();
3368 SDValue Tmp3 = ST->getValue();
3369 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3370 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3371 SDValue Result =
3372 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3373 ST->getAlign(), ST->getMemOperand()->getFlags());
3374 return Result;
3375}
3376
3377SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3378 SelectionDAG &DAG) const {
3379 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3380 // operand so that it can pass the legalization.
3381
3382 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3383 "Custom lowering for 128-bit CopyToReg only");
3384
3385 SDNode *Node = Op.getNode();
3386 SDLoc DL(Node);
3387
3388 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3389 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3390 DAG.getIntPtrConstant(0, DL));
3391 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3392 DAG.getIntPtrConstant(1, DL));
3393
3395 SmallVector<EVT, 3> ResultsType(Node->values());
3396
3397 NewOps[0] = Op->getOperand(0); // Chain
3398 NewOps[1] = Op->getOperand(1); // Dst Reg
3399 NewOps[2] = Lo; // Lower 64-bit
3400 NewOps[3] = Hi; // Higher 64-bit
3401 if (Op.getNumOperands() == 4)
3402 NewOps[4] = Op->getOperand(3); // Glue if exists
3403
3404 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3405}
3406
3407unsigned NVPTXTargetLowering::getNumRegisters(
3408 LLVMContext &Context, EVT VT,
3409 std::optional<MVT> RegisterVT = std::nullopt) const {
3410 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3411 return 1;
3412 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3413}
3414
3415bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3416 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3417 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3418 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3419 Parts[0] = Val;
3420 return true;
3421 }
3422 return false;
3423}
3424
3425// This creates target external symbol for a function parameter.
3426// Name of the symbol is composed from its index and the function name.
3427// Negative index corresponds to special parameter (unsized array) used for
3428// passing variable arguments.
3429SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3430 EVT T) const {
3431 StringRef SavedStr = nvTM->getStrPool().save(
3433 return DAG.getExternalSymbol(SavedStr.data(), T);
3434}
3435
3436SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3437 EVT T) const {
3438 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3439 return DAG.getExternalSymbol(SavedStr.data(), T);
3440}
3441
3443 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3444 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3445 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3446 const DataLayout &DL = DAG.getDataLayout();
3447 LLVMContext &Ctx = *DAG.getContext();
3448 auto PtrVT = getPointerTy(DAG.getDataLayout());
3449
3450 const Function &F = DAG.getMachineFunction().getFunction();
3451
3452 SDValue Root = DAG.getRoot();
3453 SmallVector<SDValue, 16> OutChains;
3454
3455 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3456 // Ins.size() will be larger
3457 // * if there is an aggregate argument with multiple fields (each field
3458 // showing up separately in Ins)
3459 // * if there is a vector argument with more than typical vector-length
3460 // elements (generally if more than 4) where each vector element is
3461 // individually present in Ins.
3462 // So a different index should be used for indexing into Ins.
3463 // See similar issue in LowerCall.
3464
3465 auto AllIns = ArrayRef(Ins);
3466 for (const auto &Arg : F.args()) {
3467 const auto ArgIns = AllIns.take_while(
3468 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3469 AllIns = AllIns.drop_front(ArgIns.size());
3470
3471 Type *Ty = Arg.getType();
3472
3473 if (ArgIns.empty())
3474 report_fatal_error("Empty parameter types are not supported");
3475
3476 if (Arg.use_empty()) {
3477 // argument is dead
3478 for (const auto &In : ArgIns) {
3479 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3480 InVals.push_back(DAG.getUNDEF(In.VT));
3481 }
3482 continue;
3483 }
3484
3485 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3486
3487 // In the following cases, assign a node order of "i+1"
3488 // to newly created nodes. The SDNodes for params have to
3489 // appear in the same order as their order of appearance
3490 // in the original function. "i+1" holds that order.
3491 if (Arg.hasByValAttr()) {
3492 // Param has ByVal attribute
3493 // Return MoveParam(param symbol).
3494 // Ideally, the param symbol can be returned directly,
3495 // but when SDNode builder decides to use it in a CopyToReg(),
3496 // machine instruction fails because TargetExternalSymbol
3497 // (not lowered) is target dependent, and CopyToReg assumes
3498 // the source is lowered.
3499 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3500 const auto &ByvalIn = ArgIns[0];
3501 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3502 "Ins type did not match function type");
3503 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3504
3505 SDValue P;
3506 if (isKernelFunction(F)) {
3507 P = ArgSymbol;
3508 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3509 } else {
3510 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3511 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3512 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3514 }
3515 InVals.push_back(P);
3516 } else {
3519 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3520 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3521 assert(VTs.size() == Offsets.size() && "Size mismatch");
3522
3523 const Align ArgAlign = getFunctionArgumentAlignment(
3524 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3525
3526 unsigned I = 0;
3527 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3528 for (const unsigned NumElts : VI) {
3529 // i1 is loaded/stored as i8
3530 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3531 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3532
3533 SDValue VecAddr = DAG.getObjectPtrOffset(
3534 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3535
3536 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3537 SDValue P =
3538 DAG.getLoad(VecVT, dl, Root, VecAddr,
3542 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3543 for (const unsigned J : llvm::seq(NumElts)) {
3544 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3545
3546 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3547 DAG, dl);
3548 InVals.push_back(Elt);
3549 }
3550 I += NumElts;
3551 }
3552 }
3553 }
3554
3555 if (!OutChains.empty())
3556 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3557
3558 return Chain;
3559}
3560
3561SDValue
3563 bool isVarArg,
3565 const SmallVectorImpl<SDValue> &OutVals,
3566 const SDLoc &dl, SelectionDAG &DAG) const {
3567 const Function &F = DAG.getMachineFunction().getFunction();
3568 Type *RetTy = F.getReturnType();
3569
3570 if (RetTy->isVoidTy()) {
3571 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3572 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3573 }
3574
3575 const DataLayout &DL = DAG.getDataLayout();
3576 LLVMContext &Ctx = *DAG.getContext();
3577
3578 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3579 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3580
3581 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3582 // 32-bits are sign extended or zero extended, depending on whether
3583 // they are signed or unsigned types.
3584 const bool ExtendIntegerRetVal =
3585 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3586
3589 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3590 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3591
3592 const auto GetRetVal = [&](unsigned I) -> SDValue {
3593 SDValue RetVal = OutVals[I];
3595 RetVal.getValueType() &&
3596 "OutVal type should always be legal");
3597
3598 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3599 const EVT StoreVT =
3600 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3601 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3602 };
3603
3604 unsigned I = 0;
3605 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3606 for (const unsigned NumElts : VI) {
3607 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3608 ? MaybeAlign(std::nullopt)
3609 : commonAlignment(RetAlign, Offsets[I]);
3610
3612 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3613
3614 SDValue Ptr =
3615 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3616
3617 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3619
3620 I += NumElts;
3621 }
3622
3623 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3624}
3625
3627 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3628 SelectionDAG &DAG) const {
3629 if (Constraint.size() > 1)
3630 return;
3632}
3633
3634// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3635// TgtMemIntrinsic
3636// because we need the information that is only available in the "Value" type
3637// of destination
3638// pointer. In particular, the address space information.
3640 IntrinsicInfo &Info, const CallInst &I,
3641 MachineFunction &MF, unsigned Intrinsic) const {
3642 switch (Intrinsic) {
3643 default:
3644 return false;
3645 case Intrinsic::nvvm_match_all_sync_i32p:
3646 case Intrinsic::nvvm_match_all_sync_i64p:
3647 Info.opc = ISD::INTRINSIC_W_CHAIN;
3648 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3649 // in order to model data exchange with other threads, but perform no real
3650 // memory accesses.
3651 Info.memVT = MVT::i1;
3652
3653 // Our result depends on both our and other thread's arguments.
3655 return true;
3656 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3657 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3658 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3659 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3660 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3661 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3662 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3663 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3664 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3665 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3666 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3667 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3668 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3669 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3670 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3671 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3672 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3673 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3674 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3675 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3676 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3677 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3678 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3679 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3680 Info.opc = ISD::INTRINSIC_W_CHAIN;
3681 Info.memVT = MVT::v8f16;
3682 Info.ptrVal = I.getArgOperand(0);
3683 Info.offset = 0;
3684 Info.flags = MachineMemOperand::MOLoad;
3685 Info.align = Align(16);
3686 return true;
3687 }
3688 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3689 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3690 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3691 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3692 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3693 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3694 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3695 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3696 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3697 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3698 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3699 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3700 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3701 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3702 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3703 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3704 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3705 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3706 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3707 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3708 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3709 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3710 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3711 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3712 Info.opc = ISD::INTRINSIC_W_CHAIN;
3713 Info.memVT = MVT::v2i32;
3714 Info.ptrVal = I.getArgOperand(0);
3715 Info.offset = 0;
3716 Info.flags = MachineMemOperand::MOLoad;
3717 Info.align = Align(8);
3718 return true;
3719 }
3720
3721 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3722 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3723 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3724 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3725 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3726 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3727 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3728 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3729 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3730 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3731 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3732 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3733 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3734 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3735 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3736 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3737
3738 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3739 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3740 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3741 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3742 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3743 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3744 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3745 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3746 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3747 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3748 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3749 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3750 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3751 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3752 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3753 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3754 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3755 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
3756 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
3757 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
3758 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
3759 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
3760 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
3761 Info.opc = ISD::INTRINSIC_W_CHAIN;
3762 Info.memVT = MVT::v4i32;
3763 Info.ptrVal = I.getArgOperand(0);
3764 Info.offset = 0;
3765 Info.flags = MachineMemOperand::MOLoad;
3766 Info.align = Align(16);
3767 return true;
3768 }
3769
3770 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3771 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3772 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3773 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3774 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3775 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3776 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3777 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3778
3779 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3780 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3781 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3782 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3783 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3784 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3785 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3786 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3787 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3788 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3789 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3790 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3791 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3792 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3793 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3794 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3795 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3796 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3797 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3798 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3799 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3800 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
3801 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
3802 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
3803 Info.opc = ISD::INTRINSIC_W_CHAIN;
3804 Info.memVT = MVT::i32;
3805 Info.ptrVal = I.getArgOperand(0);
3806 Info.offset = 0;
3807 Info.flags = MachineMemOperand::MOLoad;
3808 Info.align = Align(4);
3809 return true;
3810 }
3811
3812 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3813 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3814 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3815 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3816 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3817 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3818 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3819 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3820 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3821 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3822 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3823 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3824 Info.opc = ISD::INTRINSIC_W_CHAIN;
3825 Info.memVT = MVT::v4f16;
3826 Info.ptrVal = I.getArgOperand(0);
3827 Info.offset = 0;
3828 Info.flags = MachineMemOperand::MOLoad;
3829 Info.align = Align(16);
3830 return true;
3831 }
3832
3833 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3834 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3835 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3836 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3837 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3838 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3839 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3840 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3841 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3842 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3843 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3844 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3845 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3846 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3847 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3848 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3849 Info.opc = ISD::INTRINSIC_W_CHAIN;
3850 Info.memVT = MVT::v8f32;
3851 Info.ptrVal = I.getArgOperand(0);
3852 Info.offset = 0;
3853 Info.flags = MachineMemOperand::MOLoad;
3854 Info.align = Align(16);
3855 return true;
3856 }
3857
3858 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3859 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3860 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3861 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3862
3863 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3864 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3865 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3866 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3867
3868 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3869 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3870 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3871 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3872 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3873 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3874 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3875 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3876 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3877 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3878 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3879 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3880 Info.opc = ISD::INTRINSIC_W_CHAIN;
3881 Info.memVT = MVT::v8i32;
3882 Info.ptrVal = I.getArgOperand(0);
3883 Info.offset = 0;
3884 Info.flags = MachineMemOperand::MOLoad;
3885 Info.align = Align(16);
3886 return true;
3887 }
3888
3889 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3890 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3891 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3892 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3893 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3894 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3895 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3896 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3897 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3898 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
3899 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
3900 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
3901 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
3902 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
3903 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
3904 Info.opc = ISD::INTRINSIC_W_CHAIN;
3905 Info.memVT = MVT::v2i32;
3906 Info.ptrVal = I.getArgOperand(0);
3907 Info.offset = 0;
3908 Info.flags = MachineMemOperand::MOLoad;
3909 Info.align = Align(8);
3910 return true;
3911 }
3912
3913 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3914 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3915 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3916 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3917
3918 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3919 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3920 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3921 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
3922 Info.opc = ISD::INTRINSIC_W_CHAIN;
3923 Info.memVT = MVT::f64;
3924 Info.ptrVal = I.getArgOperand(0);
3925 Info.offset = 0;
3926 Info.flags = MachineMemOperand::MOLoad;
3927 Info.align = Align(8);
3928 return true;
3929 }
3930
3931 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3932 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3933 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3934 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
3935 Info.opc = ISD::INTRINSIC_W_CHAIN;
3936 Info.memVT = MVT::v2f64;
3937 Info.ptrVal = I.getArgOperand(0);
3938 Info.offset = 0;
3939 Info.flags = MachineMemOperand::MOLoad;
3940 Info.align = Align(16);
3941 return true;
3942 }
3943
3944 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3945 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3946 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3947 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3948 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3949 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3950 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3951 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3952 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3953 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3954 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3955 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3956 Info.opc = ISD::INTRINSIC_VOID;
3957 Info.memVT = MVT::v4f16;
3958 Info.ptrVal = I.getArgOperand(0);
3959 Info.offset = 0;
3960 Info.flags = MachineMemOperand::MOStore;
3961 Info.align = Align(16);
3962 return true;
3963 }
3964
3965 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3966 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3967 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3968 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3969 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3970 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3971 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3972 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3973 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3974 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3975 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3976 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3977 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3978 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3979 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3980 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
3981 Info.opc = ISD::INTRINSIC_VOID;
3982 Info.memVT = MVT::v8f32;
3983 Info.ptrVal = I.getArgOperand(0);
3984 Info.offset = 0;
3985 Info.flags = MachineMemOperand::MOStore;
3986 Info.align = Align(16);
3987 return true;
3988 }
3989
3990 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3991 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3992 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3993 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3994 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3995 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3996 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3997 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3998 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3999 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4000 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4001 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4002 Info.opc = ISD::INTRINSIC_VOID;
4003 Info.memVT = MVT::v8i32;
4004 Info.ptrVal = I.getArgOperand(0);
4005 Info.offset = 0;
4006 Info.flags = MachineMemOperand::MOStore;
4007 Info.align = Align(16);
4008 return true;
4009 }
4010
4011 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4012 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4013 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4014 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4015 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4016 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4017 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4018 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4019 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4020 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4021 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4022 Info.opc = ISD::INTRINSIC_VOID;
4023 Info.memVT = MVT::v2i32;
4024 Info.ptrVal = I.getArgOperand(0);
4025 Info.offset = 0;
4026 Info.flags = MachineMemOperand::MOStore;
4027 Info.align = Align(8);
4028 return true;
4029 }
4030
4031 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4032 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4033 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4034 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4035 Info.opc = ISD::INTRINSIC_VOID;
4036 Info.memVT = MVT::v2f64;
4037 Info.ptrVal = I.getArgOperand(0);
4038 Info.offset = 0;
4039 Info.flags = MachineMemOperand::MOStore;
4040 Info.align = Align(16);
4041 return true;
4042 }
4043
4044 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4045 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4046 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4047 Info.opc = ISD::INTRINSIC_VOID;
4048 Info.memVT = MVT::i32;
4049 Info.ptrVal = I.getArgOperand(0);
4050 Info.offset = 0;
4051 Info.flags = MachineMemOperand::MOStore;
4052 Info.align = Align(4);
4053 return true;
4054 }
4055
4056 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4057 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4058 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4059 Info.opc = ISD::INTRINSIC_VOID;
4060 Info.memVT = MVT::v4i32;
4061 Info.ptrVal = I.getArgOperand(0);
4062 Info.offset = 0;
4063 Info.flags = MachineMemOperand::MOStore;
4064 Info.align = Align(16);
4065 return true;
4066 }
4067
4068 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4069 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4070 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4071 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4072 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4073 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4074 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4075 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4076 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4077 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4078 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4079 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4080 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4081 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4082 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4083 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4084 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4085 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4086 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4087 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4088 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4089 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4090 auto &DL = I.getDataLayout();
4091 Info.opc = ISD::INTRINSIC_W_CHAIN;
4092 Info.memVT = getValueType(DL, I.getType());
4093 Info.ptrVal = I.getArgOperand(0);
4094 Info.offset = 0;
4096 Info.align.reset();
4097 return true;
4098 }
4099
4100 case Intrinsic::nvvm_prefetch_tensormap: {
4101 auto &DL = I.getDataLayout();
4102 Info.opc = ISD::INTRINSIC_VOID;
4103 Info.memVT = getPointerTy(DL);
4104 Info.ptrVal = I.getArgOperand(0);
4105 Info.offset = 0;
4106 Info.flags =
4108 Info.align.reset();
4109 return true;
4110 }
4111
4112 case Intrinsic::nvvm_ldu_global_i:
4113 case Intrinsic::nvvm_ldu_global_f:
4114 case Intrinsic::nvvm_ldu_global_p: {
4115 Info.opc = ISD::INTRINSIC_W_CHAIN;
4116 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4117 Info.ptrVal = I.getArgOperand(0);
4118 Info.offset = 0;
4119 Info.flags = MachineMemOperand::MOLoad;
4120 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4121
4122 return true;
4123 }
4124 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4125 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4126 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4127 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4128 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4129 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4130 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4131 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4132 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4133 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4134 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4135 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4136 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4137 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4138 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4139 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4140 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4141 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4142 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4143 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4144 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4145 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4146 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4147 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4148 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4149 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4150 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4151 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4152 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4153 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4154 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4155 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4156 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4157 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4158 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4159 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4160 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4161 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4162 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4163 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4164 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4165 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4166 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4167 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4168 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4169 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4170 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4171 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4172 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4173 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4174 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4175 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4176 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4177 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4178 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4179 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4180 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4181 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4182 Info.opc = ISD::INTRINSIC_W_CHAIN;
4183 Info.memVT = MVT::v4f32;
4184 Info.ptrVal = nullptr;
4185 Info.offset = 0;
4186 Info.flags = MachineMemOperand::MOLoad;
4187 Info.align = Align(16);
4188 return true;
4189
4190 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4191 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4192 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4193 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4194 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4195 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4196 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4197 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4198 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4199 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4200 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4201 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4202 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4203 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4204 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4205 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4206 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4207 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4208 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4209 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4210 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4211 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4212 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4213 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4214 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4215 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4216 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4217 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4218 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4219 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4220 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4221 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4222 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4223 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4224 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4225 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4226 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4227 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4228 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4229 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4230 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4231 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4232 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4233 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4234 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4235 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4236 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4237 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4238 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4239 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4240 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4241 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4242 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4243 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4244 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4245 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4246 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4247 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4248 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4249 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4250 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4251 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4252 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4253 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4254 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4255 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4256 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4257 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4258 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4259 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4260 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4261 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4262 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4263 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4264 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4265 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4266 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4267 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4268 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4269 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4270 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4271 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4272 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4273 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4274 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4275 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4276 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4277 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4278 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4279 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4280 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4281 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4282 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4283 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4284 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4285 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4286 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4287 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4288 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4289 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4290 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4291 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4292 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4293 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4294 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4295 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4296 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4297 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4298 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4299 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4300 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4301 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4302 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4303 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4304 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4305 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4306 Info.opc = ISD::INTRINSIC_W_CHAIN;
4307 Info.memVT = MVT::v4i32;
4308 Info.ptrVal = nullptr;
4309 Info.offset = 0;
4310 Info.flags = MachineMemOperand::MOLoad;
4311 Info.align = Align(16);
4312 return true;
4313
4314 case Intrinsic::nvvm_suld_1d_i8_clamp:
4315 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4316 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4317 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4318 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4319 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4320 case Intrinsic::nvvm_suld_2d_i8_clamp:
4321 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4322 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4323 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4324 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4325 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4326 case Intrinsic::nvvm_suld_3d_i8_clamp:
4327 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4328 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4329 case Intrinsic::nvvm_suld_1d_i8_trap:
4330 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4331 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4332 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4333 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4334 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4335 case Intrinsic::nvvm_suld_2d_i8_trap:
4336 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4337 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4338 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4339 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4340 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4341 case Intrinsic::nvvm_suld_3d_i8_trap:
4342 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4343 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4344 case Intrinsic::nvvm_suld_1d_i8_zero:
4345 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4346 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4347 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4348 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4349 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4350 case Intrinsic::nvvm_suld_2d_i8_zero:
4351 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4352 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4353 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4354 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4355 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4356 case Intrinsic::nvvm_suld_3d_i8_zero:
4357 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4358 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4359 Info.opc = ISD::INTRINSIC_W_CHAIN;
4360 Info.memVT = MVT::i8;
4361 Info.ptrVal = nullptr;
4362 Info.offset = 0;
4363 Info.flags = MachineMemOperand::MOLoad;
4364 Info.align = Align(16);
4365 return true;
4366
4367 case Intrinsic::nvvm_suld_1d_i16_clamp:
4368 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4369 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4370 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4371 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4372 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4373 case Intrinsic::nvvm_suld_2d_i16_clamp:
4374 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4375 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4376 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4377 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4378 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4379 case Intrinsic::nvvm_suld_3d_i16_clamp:
4380 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4381 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4382 case Intrinsic::nvvm_suld_1d_i16_trap:
4383 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4384 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4385 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4386 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4387 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4388 case Intrinsic::nvvm_suld_2d_i16_trap:
4389 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4390 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4391 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4392 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4393 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4394 case Intrinsic::nvvm_suld_3d_i16_trap:
4395 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4396 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4397 case Intrinsic::nvvm_suld_1d_i16_zero:
4398 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4399 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4400 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4401 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4402 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4403 case Intrinsic::nvvm_suld_2d_i16_zero:
4404 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4405 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4406 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4407 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4408 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4409 case Intrinsic::nvvm_suld_3d_i16_zero:
4410 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4411 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4412 Info.opc = ISD::INTRINSIC_W_CHAIN;
4413 Info.memVT = MVT::i16;
4414 Info.ptrVal = nullptr;
4415 Info.offset = 0;
4416 Info.flags = MachineMemOperand::MOLoad;
4417 Info.align = Align(16);
4418 return true;
4419
4420 case Intrinsic::nvvm_suld_1d_i32_clamp:
4421 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4422 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4423 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4424 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4425 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4426 case Intrinsic::nvvm_suld_2d_i32_clamp:
4427 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4428 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4429 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4430 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4431 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4432 case Intrinsic::nvvm_suld_3d_i32_clamp:
4433 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4434 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4435 case Intrinsic::nvvm_suld_1d_i32_trap:
4436 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4437 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4438 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4439 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4440 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4441 case Intrinsic::nvvm_suld_2d_i32_trap:
4442 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4443 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4444 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4445 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4446 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4447 case Intrinsic::nvvm_suld_3d_i32_trap:
4448 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4449 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4450 case Intrinsic::nvvm_suld_1d_i32_zero:
4451 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4452 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4453 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4454 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4455 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4456 case Intrinsic::nvvm_suld_2d_i32_zero:
4457 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4458 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4459 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4460 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4461 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4462 case Intrinsic::nvvm_suld_3d_i32_zero:
4463 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4464 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4465 Info.opc = ISD::INTRINSIC_W_CHAIN;
4466 Info.memVT = MVT::i32;
4467 Info.ptrVal = nullptr;
4468 Info.offset = 0;
4469 Info.flags = MachineMemOperand::MOLoad;
4470 Info.align = Align(16);
4471 return true;
4472
4473 case Intrinsic::nvvm_suld_1d_i64_clamp:
4474 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4475 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4476 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4477 case Intrinsic::nvvm_suld_2d_i64_clamp:
4478 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4479 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4480 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4481 case Intrinsic::nvvm_suld_3d_i64_clamp:
4482 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4483 case Intrinsic::nvvm_suld_1d_i64_trap:
4484 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4485 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4486 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4487 case Intrinsic::nvvm_suld_2d_i64_trap:
4488 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4489 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4490 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4491 case Intrinsic::nvvm_suld_3d_i64_trap:
4492 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4493 case Intrinsic::nvvm_suld_1d_i64_zero:
4494 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4495 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4496 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4497 case Intrinsic::nvvm_suld_2d_i64_zero:
4498 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4499 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4500 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4501 case Intrinsic::nvvm_suld_3d_i64_zero:
4502 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4503 Info.opc = ISD::INTRINSIC_W_CHAIN;
4504 Info.memVT = MVT::i64;
4505 Info.ptrVal = nullptr;
4506 Info.offset = 0;
4507 Info.flags = MachineMemOperand::MOLoad;
4508 Info.align = Align(16);
4509 return true;
4510
4511 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4512 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4513 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4514 Info.opc = ISD::INTRINSIC_W_CHAIN;
4515 Info.memVT = MVT::v1i32;
4516 Info.ptrVal = I.getArgOperand(0);
4517 Info.offset = 0;
4518 Info.flags = MachineMemOperand::MOLoad;
4519 Info.align.reset();
4520 return true;
4521 }
4522
4523 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4524 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4525 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4526 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4527 Info.opc = ISD::INTRINSIC_W_CHAIN;
4528 Info.memVT = MVT::v2i32;
4529 Info.ptrVal = I.getArgOperand(0);
4530 Info.offset = 0;
4531 Info.flags = MachineMemOperand::MOLoad;
4532 Info.align.reset();
4533 return true;
4534 }
4535
4536 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4537 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4538 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4539 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4540 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4541 Info.opc = ISD::INTRINSIC_W_CHAIN;
4542 Info.memVT = MVT::v4i32;
4543 Info.ptrVal = I.getArgOperand(0);
4544 Info.offset = 0;
4545 Info.flags = MachineMemOperand::MOLoad;
4546 Info.align.reset();
4547 return true;
4548 }
4549
4550 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4551 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4552 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4553 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4554 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4555 Info.opc = ISD::INTRINSIC_W_CHAIN;
4556 Info.memVT = MVT::v8i32;
4557 Info.ptrVal = I.getArgOperand(0);
4558 Info.offset = 0;
4559 Info.flags = MachineMemOperand::MOLoad;
4560 Info.align.reset();
4561 return true;
4562 }
4563
4564 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4565 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4566 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4567 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4568 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4569 Info.opc = ISD::INTRINSIC_W_CHAIN;
4570 Info.memVT = MVT::v16i32;
4571 Info.ptrVal = I.getArgOperand(0);
4572 Info.offset = 0;
4573 Info.flags = MachineMemOperand::MOLoad;
4574 Info.align.reset();
4575 return true;
4576 }
4577
4578 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4579 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4580 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4581 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4582 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4583 Info.opc = ISD::INTRINSIC_W_CHAIN;
4584 Info.memVT = MVT::v32i32;
4585 Info.ptrVal = I.getArgOperand(0);
4586 Info.offset = 0;
4587 Info.flags = MachineMemOperand::MOLoad;
4588 Info.align.reset();
4589 return true;
4590 }
4591
4592 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4593 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4594 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4595 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4596 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4597 Info.opc = ISD::INTRINSIC_W_CHAIN;
4598 Info.memVT = MVT::v64i32;
4599 Info.ptrVal = I.getArgOperand(0);
4600 Info.offset = 0;
4601 Info.flags = MachineMemOperand::MOLoad;
4602 Info.align.reset();
4603 return true;
4604 }
4605
4606 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4607 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4608 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4609 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4610 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4611 Info.opc = ISD::INTRINSIC_W_CHAIN;
4612 Info.memVT = MVT::v128i32;
4613 Info.ptrVal = I.getArgOperand(0);
4614 Info.offset = 0;
4615 Info.flags = MachineMemOperand::MOLoad;
4616 Info.align.reset();
4617 return true;
4618 }
4619
4620 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4621 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4622 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4623 Info.opc = ISD::INTRINSIC_VOID;
4624 Info.memVT = MVT::i32;
4625 Info.ptrVal = I.getArgOperand(0);
4626 Info.offset = 0;
4627 Info.flags = MachineMemOperand::MOStore;
4628 Info.align.reset();
4629 return true;
4630 }
4631
4632 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4633 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4634 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4635 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4636 Info.opc = ISD::INTRINSIC_VOID;
4637 Info.memVT = MVT::v2i32;
4638 Info.ptrVal = I.getArgOperand(0);
4639 Info.offset = 0;
4640 Info.flags = MachineMemOperand::MOStore;
4641 Info.align.reset();
4642 return true;
4643 }
4644
4645 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4646 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4647 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4648 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4649 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4650 Info.opc = ISD::INTRINSIC_VOID;
4651 Info.memVT = MVT::v4i32;
4652 Info.ptrVal = I.getArgOperand(0);
4653 Info.offset = 0;
4654 Info.flags = MachineMemOperand::MOStore;
4655 Info.align.reset();
4656 return true;
4657 }
4658
4659 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4660 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4661 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4662 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4663 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4664 Info.opc = ISD::INTRINSIC_VOID;
4665 Info.memVT = MVT::v8i32;
4666 Info.ptrVal = I.getArgOperand(0);
4667 Info.offset = 0;
4668 Info.flags = MachineMemOperand::MOStore;
4669 Info.align.reset();
4670 return true;
4671 }
4672
4673 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4674 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4675 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4676 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4677 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4678 Info.opc = ISD::INTRINSIC_VOID;
4679 Info.memVT = MVT::v16i32;
4680 Info.ptrVal = I.getArgOperand(0);
4681 Info.offset = 0;
4682 Info.flags = MachineMemOperand::MOStore;
4683 Info.align.reset();
4684 return true;
4685 }
4686
4687 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4688 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4689 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4690 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
4691 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
4692 Info.opc = ISD::INTRINSIC_VOID;
4693 Info.memVT = MVT::v32i32;
4694 Info.ptrVal = I.getArgOperand(0);
4695 Info.offset = 0;
4696 Info.flags = MachineMemOperand::MOStore;
4697 Info.align.reset();
4698 return true;
4699 }
4700
4701 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
4702 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
4703 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
4704 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
4705 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
4706 Info.opc = ISD::INTRINSIC_VOID;
4707 Info.memVT = MVT::v64i32;
4708 Info.ptrVal = I.getArgOperand(0);
4709 Info.offset = 0;
4710 Info.flags = MachineMemOperand::MOStore;
4711 Info.align.reset();
4712 return true;
4713 }
4714
4715 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
4716 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
4717 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
4718 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
4719 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
4720 Info.opc = ISD::INTRINSIC_VOID;
4721 Info.memVT = MVT::v128i32;
4722 Info.ptrVal = I.getArgOperand(0);
4723 Info.offset = 0;
4724 Info.flags = MachineMemOperand::MOStore;
4725 Info.align.reset();
4726 return true;
4727 }
4728 }
4729 return false;
4730}
4731
4732/// getFunctionParamOptimizedAlign - since function arguments are passed via
4733/// .param space, we may want to increase their alignment in a way that
4734/// ensures that we can effectively vectorize their loads & stores. We can
4735/// increase alignment only if the function has internal or has private
4736/// linkage as for other linkage types callers may already rely on default
4737/// alignment. To allow using 128-bit vectorized loads/stores, this function
4738/// ensures that alignment is 16 or greater.
4740 const Function *F, Type *ArgTy, const DataLayout &DL) const {
4741 // Capping the alignment to 128 bytes as that is the maximum alignment
4742 // supported by PTX.
4743 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
4744
4745 // If a function has linkage different from internal or private, we
4746 // must use default ABI alignment as external users rely on it. Same
4747 // for a function that may be called from a function pointer.
4748 if (!F || !F->hasLocalLinkage() ||
4749 F->hasAddressTaken(/*Users=*/nullptr,
4750 /*IgnoreCallbackUses=*/false,
4751 /*IgnoreAssumeLikeCalls=*/true,
4752 /*IgnoreLLVMUsed=*/true))
4753 return ABITypeAlign;
4754
4755 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4756 return std::max(Align(16), ABITypeAlign);
4757}
4758
4759/// Helper for computing alignment of a device function byval parameter.
4761 const Function *F, Type *ArgTy, Align InitialAlign,
4762 const DataLayout &DL) const {
4763 Align ArgAlign = InitialAlign;
4764 // Try to increase alignment to enhance vectorization options.
4765 if (F)
4766 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4767
4768 // Old ptx versions have a bug. When PTX code takes address of
4769 // byval parameter with alignment < 4, ptxas generates code to
4770 // spill argument into memory. Alas on sm_50+ ptxas generates
4771 // SASS code that fails with misaligned access. To work around
4772 // the problem, make sure that we align byval parameters by at
4773 // least 4. This bug seems to be fixed at least starting from
4774 // ptxas > 9.0.
4775 // TODO: remove this after verifying the bug is not reproduced
4776 // on non-deprecated ptxas versions.
4778 ArgAlign = std::max(ArgAlign, Align(4));
4779
4780 return ArgAlign;
4781}
4782
4783// Helper for getting a function parameter name. Name is composed from
4784// its index and the function name. Negative index corresponds to special
4785// parameter (unsized array) used for passing variable arguments.
4787 int Idx) const {
4788 std::string ParamName;
4789 raw_string_ostream ParamStr(ParamName);
4790
4791 ParamStr << getTargetMachine().getSymbol(F)->getName();
4792 if (Idx < 0)
4793 ParamStr << "_vararg";
4794 else
4795 ParamStr << "_param_" << Idx;
4796
4797 return ParamName;
4798}
4799
4800/// isLegalAddressingMode - Return true if the addressing mode represented
4801/// by AM is legal for this target, for a load/store of the specified type.
4802/// Used to guide target specific optimizations, like loop strength reduction
4803/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4804/// (CodeGenPrepare.cpp)
4806 const AddrMode &AM, Type *Ty,
4807 unsigned AS, Instruction *I) const {
4808 // AddrMode - This represents an addressing mode of:
4809 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4810 //
4811 // The legal address modes are
4812 // - [avar]
4813 // - [areg]
4814 // - [areg+immoff]
4815 // - [immAddr]
4816
4817 // immoff must fit in a signed 32-bit int
4818 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
4819 return false;
4820
4821 if (AM.BaseGV)
4822 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4823
4824 switch (AM.Scale) {
4825 case 0: // "r", "r+i" or "i" is allowed
4826 break;
4827 case 1:
4828 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4829 return false;
4830 // Otherwise we have r+i.
4831 break;
4832 default:
4833 // No scale > 1 is allowed
4834 return false;
4835 }
4836 return true;
4837}
4838
4839//===----------------------------------------------------------------------===//
4840// NVPTX Inline Assembly Support
4841//===----------------------------------------------------------------------===//
4842
4843/// getConstraintType - Given a constraint letter, return the type of
4844/// constraint it is for this target.
4847 if (Constraint.size() == 1) {
4848 switch (Constraint[0]) {
4849 default:
4850 break;
4851 case 'b':
4852 case 'r':
4853 case 'h':
4854 case 'c':
4855 case 'l':
4856 case 'f':
4857 case 'd':
4858 case 'q':
4859 case '0':
4860 case 'N':
4861 return C_RegisterClass;
4862 }
4863 }
4864 return TargetLowering::getConstraintType(Constraint);
4865}
4866
4867std::pair<unsigned, const TargetRegisterClass *>
4869 StringRef Constraint,
4870 MVT VT) const {
4871 if (Constraint.size() == 1) {
4872 switch (Constraint[0]) {
4873 case 'b':
4874 return std::make_pair(0U, &NVPTX::B1RegClass);
4875 case 'c':
4876 case 'h':
4877 return std::make_pair(0U, &NVPTX::B16RegClass);
4878 case 'r':
4879 case 'f':
4880 return std::make_pair(0U, &NVPTX::B32RegClass);
4881 case 'l':
4882 case 'N':
4883 case 'd':
4884 return std::make_pair(0U, &NVPTX::B64RegClass);
4885 case 'q': {
4886 if (STI.getSmVersion() < 70)
4887 report_fatal_error("Inline asm with 128 bit operands is only "
4888 "supported for sm_70 and higher!");
4889 return std::make_pair(0U, &NVPTX::B128RegClass);
4890 }
4891 }
4892 }
4893 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4894}
4895
4896//===----------------------------------------------------------------------===//
4897// NVPTX DAG Combining
4898//===----------------------------------------------------------------------===//
4899
4901 CodeGenOptLevel OptLevel) const {
4902 // Always honor command-line argument
4903 if (FMAContractLevelOpt.getNumOccurrences() > 0)
4904 return FMAContractLevelOpt > 0;
4905
4906 // Do not contract if we're not optimizing the code.
4907 if (OptLevel == CodeGenOptLevel::None)
4908 return false;
4909
4910 // Honor TargetOptions flags that explicitly say fusion is okay.
4912 return true;
4913
4914 return false;
4915}
4916
4917static bool isConstZero(const SDValue &Operand) {
4918 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
4919 return Const && Const->getZExtValue() == 0;
4920}
4921
4922/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4923/// operands N0 and N1. This is a helper for PerformADDCombine that is
4924/// called with the default operands, and if that fails, with commuted
4925/// operands.
4926static SDValue
4929 EVT VT = N0.getValueType();
4930
4931 // Since integer multiply-add costs the same as integer multiply
4932 // but is more costly than integer add, do the fusion only when
4933 // the mul is only used in the add.
4934 // TODO: this may not be true for later architectures, consider relaxing this
4935 if (!N0.getNode()->hasOneUse())
4936 return SDValue();
4937
4938 // fold (add (select cond, 0, (mul a, b)), c)
4939 // -> (select cond, c, (add (mul a, b), c))
4940 //
4941 if (N0.getOpcode() == ISD::SELECT) {
4942 unsigned ZeroOpNum;
4943 if (isConstZero(N0->getOperand(1)))
4944 ZeroOpNum = 1;
4945 else if (isConstZero(N0->getOperand(2)))
4946 ZeroOpNum = 2;
4947 else
4948 return SDValue();
4949
4950 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
4951 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
4952 return SDValue();
4953
4954 SDLoc DL(N);
4955 SDValue Mul =
4956 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
4957 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
4958 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
4959 ((ZeroOpNum == 1) ? N1 : MAD),
4960 ((ZeroOpNum == 1) ? MAD : N1));
4961 }
4962
4963 return SDValue();
4964}
4965
4966static SDValue
4969 CodeGenOptLevel OptLevel) {
4970 EVT VT = N0.getValueType();
4971 if (N0.getOpcode() == ISD::FMUL) {
4972 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4973 &DCI.DAG.getTargetLoweringInfo());
4974 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
4975 (N->getFlags().hasAllowContract() &&
4976 N0->getFlags().hasAllowContract())))
4977 return SDValue();
4978
4979 // For floating point:
4980 // Do the fusion only when the mul has less than 5 uses and all
4981 // are add.
4982 // The heuristic is that if a use is not an add, then that use
4983 // cannot be fused into fma, therefore mul is still needed anyway.
4984 // If there are more than 4 uses, even if they are all add, fusing
4985 // them will increase register pressue.
4986 //
4987 int numUses = 0;
4988 int nonAddCount = 0;
4989 for (const SDNode *User : N0.getNode()->users()) {
4990 numUses++;
4991 if (User->getOpcode() != ISD::FADD)
4992 ++nonAddCount;
4993 if (numUses >= 5)
4994 return SDValue();
4995 }
4996 if (nonAddCount) {
4997 int orderNo = N->getIROrder();
4998 int orderNo2 = N0.getNode()->getIROrder();
4999 // simple heuristics here for considering potential register
5000 // pressure, the logics here is that the differnce are used
5001 // to measure the distance between def and use, the longer distance
5002 // more likely cause register pressure.
5003 if (orderNo - orderNo2 < 500)
5004 return SDValue();
5005
5006 // Now, check if at least one of the FMUL's operands is live beyond the
5007 // node N, which guarantees that the FMA will not increase register
5008 // pressure at node N.
5009 bool opIsLive = false;
5010 const SDNode *left = N0.getOperand(0).getNode();
5011 const SDNode *right = N0.getOperand(1).getNode();
5012
5013 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5014 opIsLive = true;
5015
5016 if (!opIsLive)
5017 for (const SDNode *User : left->users()) {
5018 int orderNo3 = User->getIROrder();
5019 if (orderNo3 > orderNo) {
5020 opIsLive = true;
5021 break;
5022 }
5023 }
5024
5025 if (!opIsLive)
5026 for (const SDNode *User : right->users()) {
5027 int orderNo3 = User->getIROrder();
5028 if (orderNo3 > orderNo) {
5029 opIsLive = true;
5030 break;
5031 }
5032 }
5033
5034 if (!opIsLive)
5035 return SDValue();
5036 }
5037
5038 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5039 N0.getOperand(1), N1);
5040 }
5041
5042 return SDValue();
5043}
5044
5045/// Fold unpacking movs into a load by increasing the number of return values.
5046///
5047/// ex:
5048/// L: v2f16,ch = load <p>
5049/// a: f16 = extractelt L:0, 0
5050/// b: f16 = extractelt L:0, 1
5051/// use(a, b)
5052///
5053/// ...is turned into...
5054///
5055/// L: f16,f16,ch = LoadV2 <p>
5056/// use(L:0, L:1)
5057static SDValue
5059 // Don't run this optimization before the legalizer
5060 if (!DCI.isAfterLegalizeDAG())
5061 return SDValue();
5062
5063 EVT ElementVT = N->getValueType(0);
5064 // Avoid non-packed types and v4i8
5065 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5066 return SDValue();
5067
5068 SmallVector<SDNode *> DeadCopyToRegs;
5069
5070 // Check whether all outputs are either used by an extractelt or are
5071 // glue/chain nodes
5072 if (!all_of(N->uses(), [&](SDUse &U) {
5073 // Skip glue, chain nodes
5074 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5075 return true;
5076 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5077 if (N->getOpcode() != ISD::LOAD)
5078 return true;
5079 // Since this is an ISD::LOAD, check all extractelts are used. If
5080 // any are not used, we don't want to defeat another optimization that
5081 // will narrow the load.
5082 //
5083 // For example:
5084 //
5085 // L: v2f16,ch = load <p>
5086 // e0: f16 = extractelt L:0, 0
5087 // e1: f16 = extractelt L:0, 1 <-- unused
5088 // store e0
5089 //
5090 // Can be optimized by DAGCombiner to:
5091 //
5092 // L: f16,ch = load <p>
5093 // store L:0
5094 return !U.getUser()->use_empty();
5095 }
5096
5097 // Otherwise, this use prevents us from splitting a value.
5098 return false;
5099 }))
5100 return SDValue();
5101
5102 auto *LD = cast<MemSDNode>(N);
5103 SDLoc DL(LD);
5104
5105 // the new opcode after we double the number of operands
5106 NVPTXISD::NodeType Opcode;
5108 unsigned OldNumOutputs; // non-glue, non-chain outputs
5109 switch (LD->getOpcode()) {
5110 case ISD::LOAD:
5111 OldNumOutputs = 1;
5112 // Any packed type is legal, so the legalizer will not have lowered
5113 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5114 // here.
5115 Opcode = NVPTXISD::LoadV2;
5116 Operands.push_back(DCI.DAG.getIntPtrConstant(
5117 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5118 break;
5119 case NVPTXISD::LoadV2:
5120 OldNumOutputs = 2;
5121 Opcode = NVPTXISD::LoadV4;
5122 break;
5123 case NVPTXISD::LoadV4:
5124 // V8 is only supported for f32. Don't forget, we're not changing the load
5125 // size here. This is already a 256-bit load.
5126 if (ElementVT != MVT::v2f32)
5127 return SDValue();
5128 OldNumOutputs = 4;
5129 Opcode = NVPTXISD::LoadV8;
5130 break;
5131 case NVPTXISD::LoadV8:
5132 // PTX doesn't support the next doubling of outputs
5133 return SDValue();
5134 }
5135
5136 // the non-glue, non-chain outputs in the new load
5137 const unsigned NewNumOutputs = OldNumOutputs * 2;
5138 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5139 // add remaining chain and glue values
5140 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5141
5142 // Create the new load
5143 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5144 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5145 LD->getMemOperand());
5146
5147 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5148 // the outputs the same. These nodes will be optimized away in later
5149 // DAGCombiner iterations.
5151 for (unsigned I : seq(OldNumOutputs))
5152 Results.push_back(DCI.DAG.getBuildVector(
5153 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5154 // Add remaining chain and glue nodes
5155 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5156 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5157
5158 return DCI.DAG.getMergeValues(Results, DL);
5159}
5160
5161/// Fold packing movs into a store.
5162///
5163/// ex:
5164/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5165/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5166/// StoreV2 v1, v2
5167///
5168/// ...is turned into...
5169///
5170/// StoreV4 a, b, c, d
5173 unsigned Front, unsigned Back) {
5174 // We want to run this as late as possible since other optimizations may
5175 // eliminate the BUILD_VECTORs.
5176 if (!DCI.isAfterLegalizeDAG())
5177 return SDValue();
5178
5179 // Get the type of the operands being stored.
5180 EVT ElementVT = N->getOperand(Front).getValueType();
5181
5182 // Avoid non-packed types and v4i8
5183 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5184 return SDValue();
5185
5186 auto *ST = cast<MemSDNode>(N);
5187
5188 // The new opcode after we double the number of operands.
5189 NVPTXISD::NodeType Opcode;
5190 switch (N->getOpcode()) {
5191 case ISD::STORE:
5192 // Any packed type is legal, so the legalizer will not have lowered
5193 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5194 // it here.
5195 Opcode = NVPTXISD::StoreV2;
5196 break;
5197 case NVPTXISD::StoreV2:
5198 Opcode = NVPTXISD::StoreV4;
5199 break;
5200 case NVPTXISD::StoreV4:
5201 // V8 is only supported for f32. Don't forget, we're not changing the store
5202 // size here. This is already a 256-bit store.
5203 if (ElementVT != MVT::v2f32)
5204 return SDValue();
5205 Opcode = NVPTXISD::StoreV8;
5206 break;
5207 case NVPTXISD::StoreV8:
5208 // PTX doesn't support the next doubling of operands
5209 return SDValue();
5210 default:
5211 llvm_unreachable("Unhandled store opcode");
5212 }
5213
5214 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5215 // their elements.
5216 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5217 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5218 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5219 return SDValue();
5220
5221 // If the operand has multiple uses, this optimization can increase register
5222 // pressure.
5223 if (!BV.hasOneUse())
5224 return SDValue();
5225
5226 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5227 // any signs they may be folded by some other pattern or rule.
5228 for (SDValue Op : BV->ops()) {
5229 // Peek through bitcasts
5230 if (Op.getOpcode() == ISD::BITCAST)
5231 Op = Op.getOperand(0);
5232
5233 // This may be folded into a PRMT.
5234 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5235 Op->getOperand(0).getValueType() == MVT::i32)
5236 return SDValue();
5237
5238 // This may be folded into cvt.bf16x2
5239 if (Op.getOpcode() == ISD::FP_ROUND)
5240 return SDValue();
5241 }
5242 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5243 }
5244 Operands.append(N->op_end() - Back, N->op_end());
5245
5246 // Now we replace the store
5247 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5248 ST->getMemoryVT(), ST->getMemOperand());
5249}
5250
5252 const NVPTXSubtarget &STI) {
5253
5254 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5255 // Here is our chance to custom lower a store with a non-simple type.
5256 // Unfortunately, we can't do this in the legalizer because there is no
5257 // way to setOperationAction for an non-simple type.
5259 if (!ST->getValue().getValueType().isSimple())
5260 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5261 }
5262
5263 return combinePackingMovIntoStore(N, DCI, 1, 2);
5264}
5265
5267 const NVPTXSubtarget &STI) {
5268 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5269 // Here is our chance to custom lower a load with a non-simple type.
5270 // Unfortunately, we can't do this in the legalizer because there is no
5271 // way to setOperationAction for an non-simple type.
5272 if (!N->getValueType(0).isSimple())
5273 return lowerLoadVector(N, DCI.DAG, STI);
5274 }
5275
5276 return combineUnpackingMovIntoLoad(N, DCI);
5277}
5278
5279/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5280///
5283 CodeGenOptLevel OptLevel) {
5284 if (OptLevel == CodeGenOptLevel::None)
5285 return SDValue();
5286
5287 SDValue N0 = N->getOperand(0);
5288 SDValue N1 = N->getOperand(1);
5289
5290 // Skip non-integer, non-scalar case
5291 EVT VT = N0.getValueType();
5292 if (VT.isVector() || VT != MVT::i32)
5293 return SDValue();
5294
5295 // First try with the default operand order.
5296 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5297 return Result;
5298
5299 // If that didn't work, try again with the operands commuted.
5300 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5301}
5302
5303/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5304///
5307 CodeGenOptLevel OptLevel) {
5308 SDValue N0 = N->getOperand(0);
5309 SDValue N1 = N->getOperand(1);
5310
5311 EVT VT = N0.getValueType();
5312 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5313 return SDValue();
5314
5315 // First try with the default operand order.
5316 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5317 return Result;
5318
5319 // If that didn't work, try again with the operands commuted.
5320 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5321}
5322
5323/// Get 3-input version of a 2-input min/max opcode
5324static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode) {
5325 switch (MinMax2Opcode) {
5326 case ISD::FMAXNUM:
5327 case ISD::FMAXIMUMNUM:
5328 return NVPTXISD::FMAXNUM3;
5329 case ISD::FMINNUM:
5330 case ISD::FMINIMUMNUM:
5331 return NVPTXISD::FMINNUM3;
5332 case ISD::FMAXIMUM:
5333 return NVPTXISD::FMAXIMUM3;
5334 case ISD::FMINIMUM:
5335 return NVPTXISD::FMINIMUM3;
5336 default:
5337 llvm_unreachable("Invalid 2-input min/max opcode");
5338 }
5339}
5340
5341/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
5342/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
5345 unsigned PTXVersion, unsigned SmVersion) {
5346
5347 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
5348 EVT VT = N->getValueType(0);
5349 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
5350 return SDValue();
5351
5352 SDValue Op0 = N->getOperand(0);
5353 SDValue Op1 = N->getOperand(1);
5354 unsigned MinMaxOp2 = N->getOpcode();
5355 NVPTXISD::NodeType MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
5356
5357 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
5358 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
5359 SDValue A = Op0.getOperand(0);
5360 SDValue B = Op0.getOperand(1);
5361 SDValue C = Op1;
5362 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5363 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
5364 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
5365 SDValue A = Op0;
5366 SDValue B = Op1.getOperand(0);
5367 SDValue C = Op1.getOperand(1);
5368 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5369 }
5370 return SDValue();
5371}
5372
5375 CodeGenOptLevel OptLevel) {
5376 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5377
5378 // Don't do anything at less than -O2.
5379 if (OptLevel < CodeGenOptLevel::Default)
5380 return SDValue();
5381
5382 SelectionDAG &DAG = DCI.DAG;
5383 SDLoc DL(N);
5384 EVT VT = N->getValueType(0);
5385 bool IsSigned = N->getOpcode() == ISD::SREM;
5386 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5387
5388 const SDValue &Num = N->getOperand(0);
5389 const SDValue &Den = N->getOperand(1);
5390
5391 for (const SDNode *U : Num->users()) {
5392 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5393 U->getOperand(1) == Den) {
5394 // Num % Den -> Num - (Num / Den) * Den
5395 return DAG.getNode(ISD::SUB, DL, VT, Num,
5396 DAG.getNode(ISD::MUL, DL, VT,
5397 DAG.getNode(DivOpc, DL, VT, Num, Den),
5398 Den));
5399 }
5400 }
5401 return SDValue();
5402}
5403
5404// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
5406 CodeGenOptLevel OptLevel) {
5407 if (OptLevel == CodeGenOptLevel::None)
5408 return SDValue();
5409
5410 SDValue Op = N->getOperand(0);
5411 if (!Op.hasOneUse())
5412 return SDValue();
5413 EVT ToVT = N->getValueType(0);
5414 EVT FromVT = Op.getValueType();
5415 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5416 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5417 return SDValue();
5418 if (!(Op.getOpcode() == ISD::MUL ||
5419 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5420 return SDValue();
5421
5422 SDLoc DL(N);
5423 unsigned ExtOpcode = N->getOpcode();
5424 unsigned Opcode = 0;
5425 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5427 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5429 else
5430 return SDValue();
5431 SDValue RHS = Op.getOperand(1);
5432 if (Op.getOpcode() == ISD::SHL) {
5433 const auto ShiftAmt = Op.getConstantOperandVal(1);
5434 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5435 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5436 }
5437 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5438}
5439
5445
5446/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5447/// that can be demoted to \p OptSize bits without loss of information. The
5448/// signedness of the operand, if determinable, is placed in \p S.
5450 unsigned OptSize,
5451 OperandSignedness &S) {
5452 S = Unknown;
5453
5454 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5455 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5456 EVT OrigVT = Op.getOperand(0).getValueType();
5457 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5458 S = Signed;
5459 return true;
5460 }
5461 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5462 EVT OrigVT = Op.getOperand(0).getValueType();
5463 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5464 S = Unsigned;
5465 return true;
5466 }
5467 }
5468
5469 return false;
5470}
5471
5472/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5473/// be demoted to \p OptSize bits without loss of information. If the operands
5474/// contain a constant, it should appear as the RHS operand. The signedness of
5475/// the operands is placed in \p IsSigned.
5477 unsigned OptSize,
5478 bool &IsSigned) {
5479 OperandSignedness LHSSign;
5480
5481 // The LHS operand must be a demotable op
5482 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5483 return false;
5484
5485 // We should have been able to determine the signedness from the LHS
5486 if (LHSSign == Unknown)
5487 return false;
5488
5489 IsSigned = (LHSSign == Signed);
5490
5491 // The RHS can be a demotable op or a constant
5493 const APInt &Val = CI->getAPIntValue();
5494 if (LHSSign == Unsigned) {
5495 return Val.isIntN(OptSize);
5496 } else {
5497 return Val.isSignedIntN(OptSize);
5498 }
5499 } else {
5500 OperandSignedness RHSSign;
5501 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5502 return false;
5503
5504 return LHSSign == RHSSign;
5505 }
5506}
5507
5508/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5509/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5510/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5511/// amount.
5514 EVT MulType = N->getValueType(0);
5515 if (MulType != MVT::i32 && MulType != MVT::i64) {
5516 return SDValue();
5517 }
5518
5519 SDLoc DL(N);
5520 unsigned OptSize = MulType.getSizeInBits() >> 1;
5521 SDValue LHS = N->getOperand(0);
5522 SDValue RHS = N->getOperand(1);
5523
5524 // Canonicalize the multiply so the constant (if any) is on the right
5525 if (N->getOpcode() == ISD::MUL) {
5526 if (isa<ConstantSDNode>(LHS)) {
5527 std::swap(LHS, RHS);
5528 }
5529 }
5530
5531 // If we have a SHL, determine the actual multiply amount
5532 if (N->getOpcode() == ISD::SHL) {
5534 if (!ShlRHS) {
5535 return SDValue();
5536 }
5537
5538 APInt ShiftAmt = ShlRHS->getAPIntValue();
5539 unsigned BitWidth = MulType.getSizeInBits();
5540 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5541 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5542 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5543 } else {
5544 return SDValue();
5545 }
5546 }
5547
5548 bool Signed;
5549 // Verify that our operands are demotable
5550 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5551 return SDValue();
5552 }
5553
5554 EVT DemotedVT;
5555 if (MulType == MVT::i32) {
5556 DemotedVT = MVT::i16;
5557 } else {
5558 DemotedVT = MVT::i32;
5559 }
5560
5561 // Truncate the operands to the correct size. Note that these are just for
5562 // type consistency and will (likely) be eliminated in later phases.
5563 SDValue TruncLHS =
5564 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5565 SDValue TruncRHS =
5566 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5567
5568 unsigned Opc;
5569 if (Signed) {
5571 } else {
5573 }
5574
5575 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5576}
5577
5578static bool isConstOne(const SDValue &Operand) {
5579 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5580 return Const && Const->getZExtValue() == 1;
5581}
5582
5584 if (Add->getOpcode() != ISD::ADD)
5585 return SDValue();
5586
5587 if (isConstOne(Add->getOperand(0)))
5588 return Add->getOperand(1);
5589
5590 if (isConstOne(Add->getOperand(1)))
5591 return Add->getOperand(0);
5592
5593 return SDValue();
5594}
5595
5598
5600 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5601 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5602 }
5603
5604 return SDValue();
5605}
5606
5608 SDLoc DL,
5610 if (Select->getOpcode() != ISD::SELECT)
5611 return SDValue();
5612
5613 SDValue Cond = Select->getOperand(0);
5614
5615 unsigned ConstOpNo;
5616 if (isConstOne(Select->getOperand(1)))
5617 ConstOpNo = 1;
5618 else if (isConstOne(Select->getOperand(2)))
5619 ConstOpNo = 2;
5620 else
5621 return SDValue();
5622
5623 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5624
5625 // Do not combine if the resulting sequence is not obviously profitable.
5627 return SDValue();
5628
5629 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5630
5631 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5632 (ConstOpNo == 1) ? X : NewMul,
5633 (ConstOpNo == 1) ? NewMul : X);
5634}
5635
5636static SDValue
5639
5640 EVT VT = N0.getValueType();
5641 if (VT.isVector())
5642 return SDValue();
5643
5644 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5645 return SDValue();
5646
5647 SDLoc DL(N);
5648
5649 // (mul x, (add y, 1)) -> (add (mul x, y), x)
5650 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5651 return Res;
5652 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5653 return Res;
5654
5655 // (mul x, (select y, 1)) -> (select (mul x, y), x)
5656 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5657 return Res;
5658 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5659 return Res;
5660
5661 return SDValue();
5662}
5663
5664/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5667 CodeGenOptLevel OptLevel) {
5668 if (OptLevel == CodeGenOptLevel::None)
5669 return SDValue();
5670
5671 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5672 return Ret;
5673
5674 SDValue N0 = N->getOperand(0);
5675 SDValue N1 = N->getOperand(1);
5676 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5677}
5678
5679/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5682 CodeGenOptLevel OptLevel) {
5683 if (OptLevel > CodeGenOptLevel::None) {
5684 // Try mul.wide combining at OptLevel > 0
5685 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5686 return Ret;
5687 }
5688
5689 return SDValue();
5690}
5691
5694 unsigned int SmVersion) {
5695 EVT CCType = N->getValueType(0);
5696 SDValue A = N->getOperand(0);
5697 SDValue B = N->getOperand(1);
5698
5699 EVT AType = A.getValueType();
5700 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5701 return SDValue();
5702
5703 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5704 return SDValue();
5705
5706 SDLoc DL(N);
5707 // setp.f16x2 returns two scalar predicates, which we need to
5708 // convert back to v2i1. The returned result will be scalarized by
5709 // the legalizer, but the comparison will remain a single vector
5710 // instruction.
5711 SDValue CCNode = DCI.DAG.getNode(
5712 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5714 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5715 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5716 CCNode.getValue(1));
5717}
5718
5721 SDValue Vector = N->getOperand(0);
5722 if (Vector->getOpcode() == ISD::FREEZE)
5723 Vector = Vector->getOperand(0);
5724 SDLoc DL(N);
5725 EVT VectorVT = Vector.getValueType();
5726 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5727 IsPTXVectorType(VectorVT.getSimpleVT()))
5728 return SDValue(); // Native vector loads already combine nicely w/
5729 // extract_vector_elt.
5730 // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8),
5731 // we already handle them OK.
5732 if (VectorVT.getVectorNumElements() == 1 ||
5733 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
5734 return SDValue();
5735
5736 // Don't mess with undef values as sra may be simplified to 0, not undef.
5737 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5738 return SDValue();
5739
5740 uint64_t VectorBits = VectorVT.getSizeInBits();
5741 // We only handle the types we can extract in-register.
5742 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5743 return SDValue();
5744
5745 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5746 // Index == 0 is handled by generic DAG combiner.
5747 if (!Index || Index->getZExtValue() == 0)
5748 return SDValue();
5749
5750 MVT IVT = MVT::getIntegerVT(VectorBits);
5751 EVT EltVT = VectorVT.getVectorElementType();
5752 EVT EltIVT = EltVT.changeTypeToInteger();
5753 uint64_t EltBits = EltVT.getScalarSizeInBits();
5754
5755 SDValue Result = DCI.DAG.getNode(
5756 ISD::TRUNCATE, DL, EltIVT,
5757 DCI.DAG.getNode(
5758 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5759 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5760
5761 // If element has non-integer type, bitcast it back to the expected type.
5762 if (EltVT != EltIVT)
5763 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5764 // Past legalizer, we may need to extent i8 -> i16 to match the register type.
5765 if (EltVT != N->getValueType(0))
5766 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5767
5768 return Result;
5769}
5770
5773 SDValue VA = N->getOperand(1);
5774 EVT VectorVT = VA.getValueType();
5775 if (VectorVT != MVT::v4i8)
5776 return SDValue();
5777
5778 // We need to split vselect into individual per-element operations Because we
5779 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
5780 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
5781 // to/from i16 normally used for i8 values.
5783 SDLoc DL(N);
5784 SDValue VCond = N->getOperand(0);
5785 SDValue VB = N->getOperand(2);
5786 for (int I = 0; I < 4; ++I) {
5787 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5788 DCI.DAG.getConstant(I, DL, MVT::i32));
5789 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5790 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5791 DCI.DAG.getConstant(I, DL, MVT::i32)),
5792 DL, MVT::i32);
5793 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5794 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5795 DCI.DAG.getConstant(I, DL, MVT::i32)),
5796 DL, MVT::i32);
5797 E.push_back(DCI.DAG.getAnyExtOrTrunc(
5798 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5799 }
5800 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5801}
5802
5803static SDValue
5805 auto VT = N->getValueType(0);
5806 if (!DCI.isAfterLegalizeDAG() ||
5807 // only process v2*16 types
5808 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
5809 VT.getVectorNumElements() == 2))
5810 return SDValue();
5811
5812 auto Op0 = N->getOperand(0);
5813 auto Op1 = N->getOperand(1);
5814
5815 // Start out by assuming we want to take the lower 2 bytes of each i32
5816 // operand.
5817 uint64_t Op0Bytes = 0x10;
5818 uint64_t Op1Bytes = 0x54;
5819
5820 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
5821 {&Op1, &Op1Bytes}};
5822
5823 // Check that each operand is an i16, truncated from an i32 operand. We'll
5824 // select individual bytes from those original operands. Optionally, fold in a
5825 // shift right of that original operand.
5826 for (auto &[Op, OpBytes] : OpData) {
5827 // Eat up any bitcast
5828 if (Op->getOpcode() == ISD::BITCAST)
5829 *Op = Op->getOperand(0);
5830
5831 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
5832 Op->getOperand(0).getValueType() == MVT::i32))
5833 return SDValue();
5834
5835 // If the truncate has multiple uses, this optimization can increase
5836 // register pressure
5837 if (!Op->hasOneUse())
5838 return SDValue();
5839
5840 *Op = Op->getOperand(0);
5841
5842 // Optionally, fold in a shift-right of the original operand and let permute
5843 // pick the two higher bytes of the original value directly.
5844 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
5845 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
5846 // Shift the PRMT byte selector to pick upper bytes from each respective
5847 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
5848 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
5849 "PRMT selector values out of range");
5850 *OpBytes += 0x22;
5851 *Op = Op->getOperand(0);
5852 }
5853 }
5854 }
5855
5856 SDLoc DL(N);
5857 auto &DAG = DCI.DAG;
5858
5859 auto PRMT =
5860 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
5861 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
5862 return DAG.getBitcast(VT, PRMT);
5863}
5864
5867 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
5868
5869 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
5870 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
5871
5872 // Fold asc[B -> A](asc[A -> B](x)) -> x
5873 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
5874 return ASCN2->getOperand(0);
5875 }
5876
5877 return SDValue();
5878}
5879
5880// Given a constant selector value and a prmt mode, return the selector value
5881// normalized to the generic prmt mode. See the PTX ISA documentation for more
5882// details:
5883// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
5884static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
5885 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
5886
5888 return Selector;
5889
5890 const unsigned V = Selector.trunc(2).getZExtValue();
5891
5892 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
5893 unsigned S3) {
5894 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
5895 };
5896
5897 switch (Mode) {
5899 return GetSelector(V, V + 1, V + 2, V + 3);
5901 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
5903 return GetSelector(V, V, V, V);
5905 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
5907 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
5909 unsigned V1 = (V & 1) << 1;
5910 return GetSelector(V1, V1 + 1, V1, V1 + 1);
5911 }
5912 default:
5913 llvm_unreachable("Invalid PRMT mode");
5914 }
5915}
5916
5917static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
5918 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
5919 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
5920 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
5921 APInt BitField = B.concat(A);
5922 APInt SelectorVal = getPRMTSelector(Selector, Mode);
5923 APInt Result(32, 0);
5924 for (unsigned I : llvm::seq(4U)) {
5925 APInt Sel = SelectorVal.extractBits(4, I * 4);
5926 unsigned Idx = Sel.getLoBits(3).getZExtValue();
5927 unsigned Sign = Sel.getHiBits(1).getZExtValue();
5928 APInt Byte = BitField.extractBits(8, Idx * 8);
5929 if (Sign)
5930 Byte = Byte.ashr(8);
5931 Result.insertBits(Byte, I * 8);
5932 }
5933 return Result;
5934}
5935
5937 CodeGenOptLevel OptLevel) {
5938 if (OptLevel == CodeGenOptLevel::None)
5939 return SDValue();
5940
5941 // Constant fold PRMT
5942 if (isa<ConstantSDNode>(N->getOperand(0)) &&
5943 isa<ConstantSDNode>(N->getOperand(1)) &&
5944 isa<ConstantSDNode>(N->getOperand(2)))
5945 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
5946 N->getConstantOperandAPInt(1),
5947 N->getConstantOperandAPInt(2),
5948 N->getConstantOperandVal(3)),
5949 SDLoc(N), N->getValueType(0));
5950 return SDValue();
5951}
5952
5953// During call lowering we wrap the return values in a ProxyReg node which
5954// depend on the chain value produced by the completed call. This ensures that
5955// the full call is emitted in cases where libcalls are used to legalize
5956// operations. To improve the functioning of other DAG combines we pull all
5957// operations we can through one of these nodes, ensuring that the ProxyReg
5958// directly wraps a load. That is:
5959//
5960// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
5961//
5964 switch (R.getOpcode()) {
5965 case ISD::TRUNCATE:
5966 case ISD::ANY_EXTEND:
5967 case ISD::SIGN_EXTEND:
5968 case ISD::ZERO_EXTEND:
5969 case ISD::BITCAST: {
5970 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
5971 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
5972 return SDValue();
5973 }
5974 case ISD::SHL:
5975 case ISD::SRL:
5976 case ISD::SRA:
5977 case ISD::OR: {
5978 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
5979 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
5980 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
5981 return SDValue();
5982 }
5983 case ISD::Constant:
5984 return R;
5985 case ISD::LOAD:
5986 case NVPTXISD::LoadV2:
5987 case NVPTXISD::LoadV4: {
5988 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
5989 {Chain, R});
5990 }
5991 case ISD::BUILD_VECTOR: {
5992 if (DCI.isBeforeLegalize())
5993 return SDValue();
5994
5996 for (auto &Op : R->ops()) {
5997 SDValue V = sinkProxyReg(Op, Chain, DCI);
5998 if (!V)
5999 return SDValue();
6000 Ops.push_back(V);
6001 }
6002 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6003 }
6005 if (DCI.isBeforeLegalize())
6006 return SDValue();
6007
6008 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6010 R.getValueType(), V, R.getOperand(1));
6011 return SDValue();
6012 }
6013 default:
6014 return SDValue();
6015 }
6016}
6017
6020
6021 SDValue Chain = N->getOperand(0);
6022 SDValue Reg = N->getOperand(1);
6023
6024 // If the ProxyReg is not wrapping a load, try to pull the operations through
6025 // the ProxyReg.
6026 if (Reg.getOpcode() != ISD::LOAD) {
6027 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6028 return V;
6029 }
6030
6031 return SDValue();
6032}
6033
6034SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6035 DAGCombinerInfo &DCI) const {
6037 switch (N->getOpcode()) {
6038 default:
6039 break;
6040 case ISD::ADD:
6041 return PerformADDCombine(N, DCI, OptLevel);
6042 case ISD::ADDRSPACECAST:
6043 return combineADDRSPACECAST(N, DCI);
6044 case ISD::SIGN_EXTEND:
6045 case ISD::ZERO_EXTEND:
6046 return combineMulWide(N, DCI, OptLevel);
6047 case ISD::BUILD_VECTOR:
6048 return PerformBUILD_VECTORCombine(N, DCI);
6050 return PerformEXTRACTCombine(N, DCI);
6051 case ISD::FADD:
6052 return PerformFADDCombine(N, DCI, OptLevel);
6053 case ISD::FMAXNUM:
6054 case ISD::FMINNUM:
6055 case ISD::FMAXIMUM:
6056 case ISD::FMINIMUM:
6057 case ISD::FMAXIMUMNUM:
6058 case ISD::FMINIMUMNUM:
6059 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6060 STI.getSmVersion());
6061 case ISD::LOAD:
6062 case NVPTXISD::LoadV2:
6063 case NVPTXISD::LoadV4:
6064 return combineLOAD(N, DCI, STI);
6065 case ISD::MUL:
6066 return PerformMULCombine(N, DCI, OptLevel);
6067 case NVPTXISD::PRMT:
6068 return combinePRMT(N, DCI, OptLevel);
6069 case NVPTXISD::ProxyReg:
6070 return combineProxyReg(N, DCI);
6071 case ISD::SETCC:
6072 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6073 case ISD::SHL:
6074 return PerformSHLCombine(N, DCI, OptLevel);
6075 case ISD::SREM:
6076 case ISD::UREM:
6077 return PerformREMCombine(N, DCI, OptLevel);
6078 case ISD::STORE:
6079 case NVPTXISD::StoreV2:
6080 case NVPTXISD::StoreV4:
6081 return combineSTORE(N, DCI, STI);
6082 case ISD::VSELECT:
6083 return PerformVSELECTCombine(N, DCI);
6084 }
6085 return SDValue();
6086}
6087
6090 // Handle bitcasting to v2i8 without hitting the default promotion
6091 // strategy which goes through stack memory.
6092 SDValue Op(Node, 0);
6093 EVT ToVT = Op->getValueType(0);
6094 if (ToVT != MVT::v2i8) {
6095 return;
6096 }
6097
6098 // Bitcast to i16 and unpack elements into a vector
6099 SDLoc DL(Node);
6100 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6101 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6102 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6103 SDValue Vec1 =
6104 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6105 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6106 Results.push_back(
6107 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6108}
6109
6110// Lower vector return type of tcgen05.ld intrinsics
6113 bool hasOffset = false) {
6114 SDLoc DL(N);
6115 EVT ResVT = N->getValueType(0);
6116 if (!ResVT.isVector())
6117 return; // already legalized.
6118
6119 const unsigned NumElts = ResVT.getVectorNumElements();
6120
6121 // Create the return type of the instructions
6122 SmallVector<EVT, 5> ListVTs;
6123 for (unsigned i = 0; i < NumElts; ++i)
6124 ListVTs.push_back(MVT::i32);
6125
6126 ListVTs.push_back(N->getValueType(1)); // Chain
6127
6128 SDVTList ResVTs = DAG.getVTList(ListVTs);
6129
6130 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
6131 N->getOperand(2)};
6132
6133 if (hasOffset) {
6134 Ops.push_back(N->getOperand(3)); // offset
6135 Ops.push_back(N->getOperand(4)); // Pack flag
6136 } else
6137 Ops.push_back(N->getOperand(3)); // Pack flag
6138
6140 SDValue NewNode =
6142 MemSD->getMemoryVT(), MemSD->getMemOperand());
6143
6144 // split the vector result
6145 SmallVector<SDValue, 4> ScalarRes;
6146 for (unsigned i = 0; i < NumElts; ++i) {
6147 SDValue Res = NewNode.getValue(i);
6148 ScalarRes.push_back(Res);
6149 }
6150
6151 SDValue Chain = NewNode.getValue(NumElts);
6152 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
6153 Results.push_back(BuildVector); // Build Vector
6154 Results.push_back(Chain); // Chain
6155}
6156
6159 SDValue Chain = N->getOperand(0);
6160 SDValue Intrin = N->getOperand(1);
6161 SDLoc DL(N);
6162
6163 // Get the intrinsic ID
6164 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6165 switch (IntrinNo) {
6166 default:
6167 return;
6168 case Intrinsic::nvvm_ldu_global_i:
6169 case Intrinsic::nvvm_ldu_global_f:
6170 case Intrinsic::nvvm_ldu_global_p: {
6171 EVT ResVT = N->getValueType(0);
6172
6173 if (ResVT.isVector()) {
6174 // Vector LDG/LDU
6175
6176 unsigned NumElts = ResVT.getVectorNumElements();
6177 EVT EltVT = ResVT.getVectorElementType();
6178
6179 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6180 // legalization.
6181 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6182 // loaded type to i16 and propagate the "real" type as the memory type.
6183 bool NeedTrunc = false;
6184 if (EltVT.getSizeInBits() < 16) {
6185 EltVT = MVT::i16;
6186 NeedTrunc = true;
6187 }
6188
6189 unsigned Opcode = 0;
6190 SDVTList LdResVTs;
6191
6192 switch (NumElts) {
6193 default:
6194 return;
6195 case 2:
6196 Opcode = NVPTXISD::LDUV2;
6197 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6198 break;
6199 case 4: {
6200 Opcode = NVPTXISD::LDUV4;
6201 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6202 LdResVTs = DAG.getVTList(ListVTs);
6203 break;
6204 }
6205 }
6206
6207 SmallVector<SDValue, 8> OtherOps;
6208
6209 // Copy regular operands
6210
6211 OtherOps.push_back(Chain); // Chain
6212 // Skip operand 1 (intrinsic ID)
6213 // Others
6214 OtherOps.append(N->op_begin() + 2, N->op_end());
6215
6217
6218 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6219 MemSD->getMemoryVT(),
6220 MemSD->getMemOperand());
6221
6222 SmallVector<SDValue, 4> ScalarRes;
6223
6224 for (unsigned i = 0; i < NumElts; ++i) {
6225 SDValue Res = NewLD.getValue(i);
6226 if (NeedTrunc)
6227 Res =
6228 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6229 ScalarRes.push_back(Res);
6230 }
6231
6232 SDValue LoadChain = NewLD.getValue(NumElts);
6233
6234 SDValue BuildVec =
6235 DAG.getBuildVector(ResVT, DL, ScalarRes);
6236
6237 Results.push_back(BuildVec);
6238 Results.push_back(LoadChain);
6239 } else {
6240 // i8 LDG/LDU
6241 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6242 "Custom handling of non-i8 ldu/ldg?");
6243
6244 // Just copy all operands as-is
6246
6247 // Force output to i16
6248 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6249
6251
6252 // We make sure the memory type is i8, which will be used during isel
6253 // to select the proper instruction.
6254 SDValue NewLD =
6256 MVT::i8, MemSD->getMemOperand());
6257
6258 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6259 NewLD.getValue(0)));
6260 Results.push_back(NewLD.getValue(1));
6261 }
6262 return;
6263 }
6264
6265 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
6266 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6267 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6268 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6269 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6270 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6271 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
6273 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6275 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6277 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6278 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6279 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
6280 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6281 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6282 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6283 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6284 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6285 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6286 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6287 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6288 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6289 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6290 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6291 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6292 return ReplaceTcgen05Ld(N, DAG, Results);
6293
6294 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
6295 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6296 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6297 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6298 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6299 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6300 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6301 return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ true);
6302 }
6303}
6304
6307 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6308 // result so that it can pass the legalization
6309 SDLoc DL(N);
6310 SDValue Chain = N->getOperand(0);
6311 SDValue Reg = N->getOperand(1);
6312 SDValue Glue = N->getOperand(2);
6313
6314 assert(Reg.getValueType() == MVT::i128 &&
6315 "Custom lowering for CopyFromReg with 128-bit reg only");
6316 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6317 N->getValueType(2)};
6318 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6319
6320 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6321 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6322 {NewValue.getValue(0), NewValue.getValue(1)});
6323
6324 Results.push_back(Pair);
6325 Results.push_back(NewValue.getValue(2));
6326 Results.push_back(NewValue.getValue(3));
6327}
6328
6330 const TargetLowering &TLI,
6332 SDValue Chain = N->getOperand(0);
6333 SDValue Reg = N->getOperand(1);
6334
6335 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6336
6337 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6338 SDValue NewProxy =
6339 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6340 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6341
6342 Results.push_back(Res);
6343}
6344
6346 const NVPTXSubtarget &STI,
6348 assert(N->getValueType(0) == MVT::i128 &&
6349 "Custom lowering for atomic128 only supports i128");
6350
6352 SDLoc dl(N);
6353
6354 if (!STI.hasAtomSwap128()) {
6357 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6358 "requires target sm_90.",
6359 dl.getDebugLoc()));
6360
6361 Results.push_back(DAG.getUNDEF(MVT::i128));
6362 Results.push_back(AN->getOperand(0)); // Chain
6363 return;
6364 }
6365
6367 Ops.push_back(AN->getOperand(0)); // Chain
6368 Ops.push_back(AN->getOperand(1)); // Ptr
6369 for (const auto &Op : AN->ops().drop_front(2)) {
6370 // Low part
6371 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6372 DAG.getIntPtrConstant(0, dl)));
6373 // High part
6374 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6375 DAG.getIntPtrConstant(1, dl)));
6376 }
6377 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6380 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6381 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6382 AN->getMemOperand());
6383 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6384 {Result.getValue(0), Result.getValue(1)}));
6385 Results.push_back(Result.getValue(2));
6386}
6387
6388void NVPTXTargetLowering::ReplaceNodeResults(
6390 switch (N->getOpcode()) {
6391 default:
6392 report_fatal_error("Unhandled custom legalization");
6393 case ISD::BITCAST:
6394 ReplaceBITCAST(N, DAG, Results);
6395 return;
6396 case ISD::LOAD:
6397 replaceLoadVector(N, DAG, Results, STI);
6398 return;
6401 return;
6402 case ISD::CopyFromReg:
6404 return;
6405 case NVPTXISD::ProxyReg:
6406 replaceProxyReg(N, DAG, *this, Results);
6407 return;
6408 case ISD::ATOMIC_CMP_SWAP:
6409 case ISD::ATOMIC_SWAP:
6410 replaceAtomicSwap128(N, DAG, STI, Results);
6411 return;
6412 }
6413}
6414
6417 Type *Ty = AI->getValOperand()->getType();
6418
6419 if (AI->isFloatingPointOperation()) {
6421 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6422 STI.getPTXVersion() >= 63)
6424 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6425 STI.getPTXVersion() >= 78)
6427 if (Ty->isFloatTy())
6429 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6431 }
6433 }
6434
6435 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6436 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6437
6438 switch (AI->getOperation()) {
6439 default:
6442 if (BitWidth == 128)
6448 switch (BitWidth) {
6449 case 8:
6450 case 16:
6452 case 32:
6454 case 64:
6455 if (STI.hasAtomBitwise64())
6458 case 128:
6460 default:
6461 llvm_unreachable("unsupported width encountered");
6462 }
6469 switch (BitWidth) {
6470 case 8:
6471 case 16:
6473 case 32:
6475 case 64:
6476 if (STI.hasAtomMinMax64())
6479 case 128:
6481 default:
6482 llvm_unreachable("unsupported width encountered");
6483 }
6486 switch (BitWidth) {
6487 case 32:
6489 case 8:
6490 case 16:
6491 case 64:
6492 case 128:
6494 default:
6495 llvm_unreachable("unsupported width encountered");
6496 }
6497 }
6498
6500}
6501
6503 const Instruction *I) const {
6504 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6505 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6506 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6507 // the memory order using explicit fences around the retry loop.
6508 // The memory order of natively supported CAS operations can be enforced
6509 // by lowering to an atom.cas with the right memory synchronizing effect.
6510 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6511 // So we also use explicit fences for enforcing memory order for
6512 // seq_cast CAS with natively-supported bitwidths.
6513 return CI &&
6514 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6515 STI.getMinCmpXchgSizeInBits() ||
6516 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6517}
6518
6520 const Instruction *I) const {
6521 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6522 bool BitwidthSupportedAndIsSeqCst =
6523 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6524 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6525 STI.getMinCmpXchgSizeInBits();
6526 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6528}
6529
6531 Instruction *Inst,
6532 AtomicOrdering Ord) const {
6533 if (!isa<AtomicCmpXchgInst>(Inst))
6534 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6535
6536 // Specialize for cmpxchg
6537 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6538 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6539 if (isReleaseOrStronger(Ord))
6540 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
6541 ? Ord
6543 SSID);
6544
6545 return nullptr;
6546}
6547
6549 Instruction *Inst,
6550 AtomicOrdering Ord) const {
6551 // Specialize for cmpxchg
6552 if (!isa<AtomicCmpXchgInst>(Inst))
6553 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6554
6555 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6556 auto CASWidth =
6557 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6558 SyncScope::ID SSID = CI->getSyncScopeID();
6559 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6560 if (isAcquireOrStronger(Ord) &&
6562 CASWidth < STI.getMinCmpXchgSizeInBits()))
6563 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6564
6565 return nullptr;
6566}
6567
6568// Rather than default to SINT when both UINT and SINT are custom, we only
6569// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6570// both are custom since unsigned CVT instructions can lead to slightly better
6571// SASS code with fewer instructions.
6573 EVT ToVT) const {
6574 if (isOperationLegal(Op, ToVT))
6575 return Op;
6576 switch (Op) {
6577 case ISD::FP_TO_UINT:
6579 return ISD::FP_TO_SINT;
6580 break;
6584 break;
6585 case ISD::VP_FP_TO_UINT:
6586 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6587 return ISD::VP_FP_TO_SINT;
6588 break;
6589 default:
6590 break;
6591 }
6592 return Op;
6593}
6594
6595// Pin NVPTXTargetObjectFile's vtables to this file.
6597
6602
6604 const SelectionDAG &DAG, unsigned Depth) {
6605 SDValue A = Op.getOperand(0);
6606 SDValue B = Op.getOperand(1);
6607 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6608 unsigned Mode = Op.getConstantOperandVal(3);
6609
6610 if (!Selector)
6611 return;
6612
6613 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6614 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6615
6616 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6617 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6618 "PRMT must have i32 operands");
6619 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6620 KnownBits BitField = BKnown.concat(AKnown);
6621
6622 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6623 for (unsigned I : llvm::seq(4)) {
6624 APInt Sel = SelectorVal.extractBits(4, I * 4);
6625 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6626 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6627 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6628 if (Sign)
6629 Byte = KnownBits::ashr(Byte, 8);
6630 Known.insertBits(Byte, I * 8);
6631 }
6632}
6633
6634static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6636
6637 // We can't do anything without knowing the sign bit.
6638 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6639 if (ExtType == ISD::SEXTLOAD)
6640 return;
6641
6642 // ExtLoading to vector types is weird and may not work well with known bits.
6643 auto DestVT = LD->getValueType(0);
6644 if (DestVT.isVector())
6645 return;
6646
6647 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6648 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6649 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6650}
6651
6653 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6654 const SelectionDAG &DAG, unsigned Depth) const {
6655 Known.resetAll();
6656
6657 switch (Op.getOpcode()) {
6658 case NVPTXISD::PRMT:
6659 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6660 break;
6661 case NVPTXISD::LoadV2:
6662 case NVPTXISD::LoadV4:
6663 case NVPTXISD::LoadV8:
6665 break;
6666 default:
6667 break;
6668 }
6669}
6670
6671static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6672 const APInt &DemandedBits) {
6673 APInt DemandedLHS = APInt(32, 0);
6674 APInt DemandedRHS = APInt(32, 0);
6675
6676 for (unsigned I : llvm::seq(4)) {
6677 if (DemandedBits.extractBits(8, I * 8).isZero())
6678 continue;
6679
6680 APInt Sel = SelectorVal.extractBits(4, I * 4);
6681 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6682 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6683
6684 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6685 unsigned ByteStart = (Idx % 4) * 8;
6686 if (Sign)
6687 Src.setBit(ByteStart + 7);
6688 else
6689 Src.setBits(ByteStart, ByteStart + 8);
6690 }
6691
6692 return {DemandedLHS, DemandedRHS};
6693}
6694
6695// Replace undef with 0 as this is easier for other optimizations such as
6696// known bits.
6698 if (!Op)
6699 return SDValue();
6700 if (Op.isUndef())
6701 return DAG.getConstant(0, SDLoc(), MVT::i32);
6702 return Op;
6703}
6704
6706 const APInt &DemandedBits,
6707 SelectionDAG &DAG,
6708 const TargetLowering &TLI,
6709 unsigned Depth) {
6710 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
6711 SDValue Op0 = PRMT.getOperand(0);
6712 SDValue Op1 = PRMT.getOperand(1);
6713 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
6714 if (!SelectorConst)
6715 return SDValue();
6716
6717 unsigned Mode = PRMT.getConstantOperandVal(3);
6718 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
6719
6720 // Try to simplify the PRMT to one of the inputs if the used bytes are all
6721 // from the same input in the correct order.
6722 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
6723 const unsigned SelBits = (4 - LeadingBytes) * 4;
6724 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
6725 return Op0;
6726 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
6727 return Op1;
6728
6729 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
6730
6731 // Attempt to avoid multi-use ops if we don't need anything from them.
6732 SDValue DemandedOp0 =
6733 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
6734 SDValue DemandedOp1 =
6735 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
6736
6737 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
6738 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
6739 if ((DemandedOp0 && DemandedOp0 != Op0) ||
6740 (DemandedOp1 && DemandedOp1 != Op1)) {
6741 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
6742 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
6743 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
6744 }
6745
6746 return SDValue();
6747}
6748
6750 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
6751 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
6752 Known.resetAll();
6753
6754 switch (Op.getOpcode()) {
6755 case NVPTXISD::PRMT:
6757 *this, Depth)) {
6758 TLO.CombineTo(Op, Result);
6759 return true;
6760 }
6761 break;
6762 default:
6763 break;
6764 }
6765
6766 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
6767 return false;
6768}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:404
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results, bool hasOffset=false)
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1130
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:521
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3155
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:379
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ CALL
This node represents a PTX call instruction.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scala...
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:251
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:233
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:219
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...