LLVM 23.0.0git
NVPTXISelLowering.cpp
Go to the documentation of this file.
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
19#include "NVPTXSubtarget.h"
20#include "NVPTXTargetMachine.h"
22#include "NVPTXUtilities.h"
23#include "NVVMProperties.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/StringRef.h"
41#include "llvm/IR/Argument.h"
42#include "llvm/IR/Attributes.h"
43#include "llvm/IR/Constants.h"
44#include "llvm/IR/DataLayout.h"
47#include "llvm/IR/FPEnv.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalValue.h"
50#include "llvm/IR/IRBuilder.h"
51#include "llvm/IR/Instruction.h"
53#include "llvm/IR/IntrinsicsNVPTX.h"
54#include "llvm/IR/Module.h"
55#include "llvm/IR/Type.h"
56#include "llvm/IR/Value.h"
68#include <algorithm>
69#include <cassert>
70#include <cmath>
71#include <cstdint>
72#include <iterator>
73#include <optional>
74#include <string>
75#include <tuple>
76#include <utility>
77#include <vector>
78
79#define DEBUG_TYPE "nvptx-lower"
80
81using namespace llvm;
82
84 "nvptx-sched4reg",
85 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
86
88 "nvptx-fma-level", cl::Hidden,
89 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
90 " 1: do it 2: do it aggressively"),
91 cl::init(2));
92
94 "nvptx-prec-divf32", cl::Hidden,
96 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
98 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
99 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
101 "Use IEEE Compliant F32 div.rnd if available (default)"),
103 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
105
107 "nvptx-prec-sqrtf32", cl::Hidden,
108 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
109 cl::init(true));
110
111/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
112/// does NOT use lg2.approx for log2, so this is disabled by default.
114 "nvptx-approx-log2f32",
115 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
116 cl::init(false));
117
120 const SDNode &N) const {
121 // If nvptx-prec-divf32=N is used on the command-line, always honor it
122 if (UsePrecDivF32.getNumOccurrences() > 0)
123 return UsePrecDivF32;
124
125 const SDNodeFlags Flags = N.getFlags();
126 if (Flags.hasApproximateFuncs())
128
130}
131
133 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
134 if (UsePrecSqrtF32.getNumOccurrences() > 0)
135 return UsePrecSqrtF32;
136
137 if (N) {
138 const SDNodeFlags Flags = N->getFlags();
139 if (Flags.hasApproximateFuncs())
140 return false;
141 }
142
143 return true;
144}
145
150
151static bool IsPTXVectorType(MVT VT) {
152 switch (VT.SimpleTy) {
153 default:
154 return false;
155 case MVT::v2i1:
156 case MVT::v4i1:
157 case MVT::v2i8:
158 case MVT::v4i8:
159 case MVT::v8i8: // <2 x i8x4>
160 case MVT::v16i8: // <4 x i8x4>
161 case MVT::v2i16:
162 case MVT::v4i16:
163 case MVT::v8i16: // <4 x i16x2>
164 case MVT::v2i32:
165 case MVT::v4i32:
166 case MVT::v2i64:
167 case MVT::v2f16:
168 case MVT::v4f16:
169 case MVT::v8f16: // <4 x f16x2>
170 case MVT::v2bf16:
171 case MVT::v4bf16:
172 case MVT::v8bf16: // <4 x bf16x2>
173 case MVT::v2f32:
174 case MVT::v4f32:
175 case MVT::v2f64:
176 case MVT::v4i64:
177 case MVT::v4f64:
178 case MVT::v8i32:
179 case MVT::v8f32:
180 case MVT::v16f16: // <8 x f16x2>
181 case MVT::v16bf16: // <8 x bf16x2>
182 case MVT::v16i16: // <8 x i16x2>
183 case MVT::v32i8: // <8 x i8x4>
184 return true;
185 }
186}
187
188// When legalizing vector loads/stores, this function is called, which does two
189// things:
190// 1. Determines whether the vector is something we want to custom lower;
191// std::nullopt is returned if we do not want to custom lower it.
192// 2. If we do want to handle it, returns a pair of:
193// - unsigned int NumElts - The number of elements in the final vector
194// - MVT EltVT - The type of the elements in the final vector
195static std::optional<std::pair<unsigned int, MVT>>
197 unsigned AddressSpace) {
198 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
199
200 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
201 VectorEVT.getSizeInBits() == 256)
202 return {{4, MVT::i64}};
203
204 if (!VectorEVT.isSimple())
205 return std::nullopt;
206 const MVT VectorVT = VectorEVT.getSimpleVT();
207
208 if (!VectorVT.isVector()) {
209 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
210 return {{2, MVT::i64}};
211 return std::nullopt;
212 }
213
214 const MVT EltVT = VectorVT.getVectorElementType();
215 const unsigned NumElts = VectorVT.getVectorNumElements();
216
217 // The size of the PTX virtual register that holds a packed type.
218 unsigned PackRegSize;
219
220 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
221 // legal. We can (and should) split that into 2 stores of <2 x double> here
222 // but I'm leaving that as a TODO for now.
223 switch (VectorVT.SimpleTy) {
224 default:
225 return std::nullopt;
226
227 case MVT::v4i64:
228 case MVT::v4f64:
229 // This is a "native" vector type iff the address space is global and the
230 // target supports 256-bit loads/stores
231 if (!CanLowerTo256Bit)
232 return std::nullopt;
233 [[fallthrough]];
234 case MVT::v2i8:
235 case MVT::v2i64:
236 case MVT::v2f64:
237 // This is a "native" vector type
238 return std::pair(NumElts, EltVT);
239
240 case MVT::v16f16: // <8 x f16x2>
241 case MVT::v16bf16: // <8 x bf16x2>
242 case MVT::v16i16: // <8 x i16x2>
243 case MVT::v32i8: // <8 x i8x4>
244 // This can be upsized into a "native" vector type iff the address space is
245 // global and the target supports 256-bit loads/stores.
246 if (!CanLowerTo256Bit)
247 return std::nullopt;
248 [[fallthrough]];
249 case MVT::v2i16: // <1 x i16x2>
250 case MVT::v2f16: // <1 x f16x2>
251 case MVT::v2bf16: // <1 x bf16x2>
252 case MVT::v4i8: // <1 x i8x4>
253 case MVT::v4i16: // <2 x i16x2>
254 case MVT::v4f16: // <2 x f16x2>
255 case MVT::v4bf16: // <2 x bf16x2>
256 case MVT::v8i8: // <2 x i8x4>
257 case MVT::v8f16: // <4 x f16x2>
258 case MVT::v8bf16: // <4 x bf16x2>
259 case MVT::v8i16: // <4 x i16x2>
260 case MVT::v16i8: // <4 x i8x4>
261 PackRegSize = 32;
262 break;
263
264 case MVT::v8f32: // <4 x f32x2>
265 case MVT::v8i32: // <4 x i32x2>
266 // This is a "native" vector type iff the address space is global and the
267 // target supports 256-bit loads/stores
268 if (!CanLowerTo256Bit)
269 return std::nullopt;
270 [[fallthrough]];
271 case MVT::v2f32: // <1 x f32x2>
272 case MVT::v4f32: // <2 x f32x2>
273 case MVT::v2i32: // <1 x i32x2>
274 case MVT::v4i32: // <2 x i32x2>
275 if (!STI.hasF32x2Instructions())
276 return std::pair(NumElts, EltVT);
277 PackRegSize = 64;
278 break;
279 }
280
281 // If we reach here, then we can pack 2 or more elements into a single 32-bit
282 // or 64-bit PTX register and treat the vector as a new vector containing
283 // packed elements.
284
285 // Number of elements to pack in one word.
286 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
287
288 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
289}
290
291/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
292/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
293/// the types as required by the calling convention (with special handling for
294/// i8s).
295/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
296/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
297/// LowerCall, and LowerReturn.
298static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
299 LLVMContext &Ctx, CallingConv::ID CallConv,
300 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
302 uint64_t StartingOffset = 0) {
303 SmallVector<EVT, 16> TempVTs;
304 SmallVector<uint64_t, 16> TempOffsets;
305 ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,
306 StartingOffset);
307
308 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
309 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
310 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
311
312 // Since we actually can load/store b8, we need to ensure that we'll use
313 // the original sized type for any i8s or i8 vectors.
314 if (VT.getScalarType() == MVT::i8) {
315 if (RegisterVT == MVT::i16)
316 RegisterVT = MVT::i8;
317 else if (RegisterVT == MVT::v2i16)
318 RegisterVT = MVT::v2i8;
319 else
320 assert(RegisterVT == MVT::v4i8 &&
321 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
322 }
323
324 // TODO: This is horribly incorrect for cases where the vector elements are
325 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
326 // has existed for as long as NVPTX has and no one has complained, so we'll
327 // leave it for now.
328 for (unsigned I : seq(NumRegs)) {
329 ValueVTs.push_back(RegisterVT);
330 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
331 }
332 }
333}
334
335// We return an EVT that can hold N VTs
336// If the VT is a vector, the resulting EVT is a flat vector with the same
337// element type as VT's element type.
338static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
339 if (N == 1)
340 return VT;
341
342 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
343 VT.getVectorNumElements() * N)
344 : EVT::getVectorVT(C, VT, N);
345}
346
348 const SDLoc &dl, SelectionDAG &DAG) {
349 if (V.getValueType() == VT) {
350 assert(I == 0 && "Index must be 0 for scalar value");
351 return V;
352 }
353
354 if (!VT.isVector())
355 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
356 DAG.getVectorIdxConstant(I, dl));
357
358 return DAG.getNode(
359 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
361}
362
363template <typename T>
364static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
365 SelectionDAG &DAG, T GetElement) {
366 if (N == 1)
367 return GetElement(0);
368
370 for (const unsigned I : llvm::seq(N)) {
371 SDValue Val = GetElement(I);
372 if (Val.getValueType().isVector())
373 DAG.ExtractVectorElements(Val, Values);
374 else
375 Values.push_back(Val);
376 }
377
378 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
379 Values.size());
380 return DAG.getBuildVector(VT, dl, Values);
381}
382
383/// PromoteScalarIntegerPTX
384/// Used to make sure the arguments/returns are suitable for passing
385/// and promote them to a larger size if they're not.
386///
387/// The promoted type is placed in \p PromoteVT if the function returns true.
389 if (VT.isScalarInteger()) {
390 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
391 default:
393 "Promotion is not suitable for scalars of size larger than 64-bits");
394 case 1:
395 return MVT::i1;
396 case 2:
397 case 4:
398 case 8:
399 return MVT::i8;
400 case 16:
401 return MVT::i16;
402 case 32:
403 return MVT::i32;
404 case 64:
405 return MVT::i64;
406 }
407 }
408 return VT;
409}
410
411// Check whether we can merge loads/stores of some of the pieces of a
412// flattened function parameter or return value into a single vector
413// load/store.
414//
415// The flattened parameter is represented as a list of EVTs and
416// offsets, and the whole structure is aligned to ParamAlignment. This
417// function determines whether we can load/store pieces of the
418// parameter starting at index Idx using a single vectorized op of
419// size AccessSize. If so, it returns the number of param pieces
420// covered by the vector op. Otherwise, it returns 1.
421template <typename T>
423 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
424 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
425
426 // Can't vectorize if param alignment is not sufficient.
427 if (ParamAlignment < AccessSize)
428 return 1;
429 // Can't vectorize if offset is not aligned.
430 if (Offsets[Idx] & (AccessSize - 1))
431 return 1;
432
433 EVT EltVT = ValueVTs[Idx];
434 unsigned EltSize = EltVT.getStoreSize();
435
436 // Element is too large to vectorize.
437 if (EltSize >= AccessSize)
438 return 1;
439
440 unsigned NumElts = AccessSize / EltSize;
441 // Can't vectorize if AccessBytes if not a multiple of EltSize.
442 if (AccessSize != EltSize * NumElts)
443 return 1;
444
445 // We don't have enough elements to vectorize.
446 if (Idx + NumElts > ValueVTs.size())
447 return 1;
448
449 // PTX ISA can only deal with 2- and 4-element vector ops.
450 if (NumElts != 4 && NumElts != 2)
451 return 1;
452
453 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
454 // Types do not match.
455 if (ValueVTs[j] != EltVT)
456 return 1;
457
458 // Elements are not contiguous.
459 if (Offsets[j] - Offsets[j - 1] != EltSize)
460 return 1;
461 }
462 // OK. We can vectorize ValueVTs[i..i+NumElts)
463 return NumElts;
464}
465
466// Computes whether and how we can vectorize the loads/stores of a
467// flattened function parameter or return value.
468//
469// The flattened parameter is represented as the list of ValueVTs and
470// Offsets, and is aligned to ParamAlignment bytes. We return a vector
471// of the same size as ValueVTs indicating how each piece should be
472// loaded/stored (i.e. as a scalar, or as part of a vector
473// load/store).
474template <typename T>
477 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
478 bool IsVAArg = false) {
479 // Set vector size to match ValueVTs and mark all elements as
480 // scalars by default.
481
482 if (IsVAArg)
483 return SmallVector<unsigned>(ValueVTs.size(), 1);
484
485 SmallVector<unsigned, 16> VectorInfo;
486
487 const auto GetNumElts = [&](unsigned I) -> unsigned {
488 for (const unsigned AccessSize : {16, 8, 4, 2}) {
489 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
490 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
491 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
492 "Unexpected vectorization size");
493 if (NumElts != 1)
494 return NumElts;
495 }
496 return 1;
497 };
498
499 // Check what we can vectorize using 128/64/32-bit accesses.
500 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
501 const unsigned NumElts = GetNumElts(I);
502 VectorInfo.push_back(NumElts);
503 I += NumElts;
504 }
505 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
506 ValueVTs.size());
507 return VectorInfo;
508}
509
510// NVPTXTargetLowering Constructor.
512 const NVPTXSubtarget &STI)
513 : TargetLowering(TM, STI), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
514 // Always lower memset, memcpy, and memmove intrinsics to load/store
515 // instructions, rather than generating calls to memset, memcpy, or
516 // memmove.
520
523
524 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
525 // condition branches.
526 setJumpIsExpensive(true);
527
528 // Wide divides are _very_ slow. Try to reduce the width of the divide if
529 // possible.
530 addBypassSlowDiv(64, 32);
531
532 // By default, use the Source scheduling
533 if (sched4reg)
535 else
537
538 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
539 LegalizeAction NoF16Action) {
540 bool IsOpSupported = STI.allowFP16Math();
541 switch (Op) {
542 // Several FP16 instructions are only available on sm_80 and newer.
543 case ISD::FMINNUM:
544 case ISD::FMAXNUM:
547 case ISD::FMAXIMUM:
548 case ISD::FMINIMUM:
549 case ISD::FMAXIMUMNUM:
550 case ISD::FMINIMUMNUM:
551 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
552 break;
553 case ISD::FEXP2:
554 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
555 break;
556 }
557 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
558 };
559
560 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
561 LegalizeAction NoBF16Action) {
562 bool IsOpSupported = STI.hasNativeBF16Support(Op);
564 Op, VT, IsOpSupported ? Action : NoBF16Action);
565 };
566
567 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
568 LegalizeAction NoI16x2Action) {
569 bool IsOpSupported = false;
570 // These instructions are only available on sm_90 and newer.
571 switch (Op) {
572 case ISD::ADD:
573 case ISD::SMAX:
574 case ISD::SMIN:
575 case ISD::UMIN:
576 case ISD::UMAX:
577 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
578 break;
579 }
580 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
581 };
582
583 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
584 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
585 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
586 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
587 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
588 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
589 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
591 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
592 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
593 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
594 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
595
596 if (STI.hasF32x2Instructions()) {
597 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
598 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
599 }
600
601 // Conversion to/from FP16/FP16x2 is always legal.
606
608 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
610
611 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
612 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
613
614 // Conversion to/from BFP16/BFP16x2 is always legal.
619
620 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
621 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
622 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
623 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
624
625 // Conversion to/from i16/i16x2 is always legal.
630
635
636 // No support for these operations with v2f32/v2i32
637 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
638 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
639
642 MVT::v2i32, Expand);
643
644 // Need custom lowering in case the index is dynamic.
645 if (STI.hasF32x2Instructions())
646 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
647 Custom);
648
649 // Custom conversions to/from v2i8.
651
652 // Only logical ops can be done on v4i8/v2i32 directly, others must be done
653 // elementwise.
670 {MVT::v4i8, MVT::v2i32}, Expand);
671
672 // Operations not directly supported by NVPTX.
673 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
674 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
675 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
678 }
679
680 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
681 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
682
683 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
684 // For others we will expand to a SHL/SRA pair.
690 setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
691
698
701
703 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
704 Expand);
705
706 if (STI.hasHWROT32()) {
709 Custom);
710 }
711
712 setOperationAction(ISD::BR_JT, MVT::Other, STI.hasBrx() ? Legal : Expand);
714
715 // We want to legalize constant related memmove and memcpy
716 // intrinsics.
718
719 // FP extload/truncstore is not legal in PTX. We need to expand all these.
720 for (auto FloatVTs :
722 for (MVT ValVT : FloatVTs) {
723 for (MVT MemVT : FloatVTs) {
724 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
725 setTruncStoreAction(ValVT, MemVT, Expand);
726 }
727 }
728 }
729
730 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
731 // how they'll be lowered in ISel anyway, and by doing this a little earlier
732 // we allow for more DAG combine opportunities.
733 for (auto IntVTs :
735 for (MVT ValVT : IntVTs)
736 for (MVT MemVT : IntVTs)
737 if (isTypeLegal(ValVT))
738 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
739
740 // PTX does not support load / store predicate registers
742 for (MVT VT : MVT::integer_valuetypes()) {
744 Promote);
745 setTruncStoreAction(VT, MVT::i1, Expand);
746 }
747
748 // Disable generations of extload/truncstore for v2i32/v2i16/v2i8. The generic
749 // expansion for these nodes when they are unaligned is incorrect if the
750 // type is a vector.
751 //
752 // TODO: Fix the generic expansion for these nodes found in
753 // TargetLowering::expandUnalignedLoad/Store.
755 MVT::v2i8, Expand);
757 {MVT::v2i8, MVT::v2i16}, Expand);
758 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
759 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
760 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
761
762 // Register custom handling for illegal type loads/stores. We'll try to custom
763 // lower almost all illegal types and logic in the lowering will discard cases
764 // we can't handle.
765 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::i256, MVT::f128},
766 Custom);
768 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
770 Custom);
771
772 // Custom legalization for LDU intrinsics.
773 // TODO: The logic to lower these is not very robust and we should rewrite it.
774 // Perhaps LDU should not be represented as an intrinsic at all.
777 if (IsPTXVectorType(VT))
779
783 MVT::i1, Expand);
784
785 // This is legal in NVPTX
790
791 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
793
794 // TRAP can be lowered to PTX trap
795 setOperationAction(ISD::TRAP, MVT::Other, Legal);
796 // DEBUGTRAP can be lowered to PTX brkpt
798
799 // Support varargs.
804
806 {MVT::i16, MVT::i32, MVT::i64}, Legal);
807 // PTX abs.s is undefined for INT_MIN, so ISD::ABS (which requires
808 // abs(INT_MIN) == INT_MIN) must be expanded. ABS_MIN_POISON matches
809 // PTX abs semantics since INT_MIN input is poison/undefined.
810 setOperationAction(ISD::ABS, {MVT::i16, MVT::i32, MVT::i64}, Expand);
811 setOperationAction(ISD::ABS_MIN_POISON, {MVT::i16, MVT::i32, MVT::i64},
812 Legal);
813
815 Promote);
818
819 setI16x2OperationAction(ISD::ABS_MIN_POISON, MVT::v2i16, Legal, Custom);
820 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
821 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
822 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
823 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
824 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
825 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
826
827 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
828 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
829 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
830 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
831 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
832 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
833
834 // Other arithmetic and logic ops are unsupported.
838 {MVT::v2i16, MVT::v2i32}, Expand);
839
840 // v2i32 is not supported for any arithmetic operations
845 MVT::v2i32, Expand);
846
851 if (STI.getPTXVersion() >= 43) {
856 }
857
859 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
862
863 // PTX does not directly support SELP of i1, so promote to i32 first
865
866 // PTX cannot multiply two i64s in a single instruction.
869
870 // We have some custom DAG combine patterns for these nodes
872 ISD::AND,
874 ISD::FADD,
881 ISD::MUL,
883 ISD::SHL,
884 ISD::SREM,
885 ISD::UREM,
889 ISD::LOAD,
894
895 // If the vector operands require register coalescing, scalarize instead
896 if (STI.hasF32x2Instructions())
898
899 // setcc for f16x2 and bf16x2 needs special handling to prevent
900 // legalizer's attempt to scalarize it due to v2i1 not being legal.
901 if (STI.allowFP16Math() || STI.hasBF16Math())
903
904 // Vector reduction operations. These may be turned into shuffle or tree
905 // reductions depending on what instructions are available for each type.
907 MVT EltVT = VT.getVectorElementType();
908 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
911 VT, Custom);
912 }
913 }
914
915 // Promote fp16 arithmetic if fp16 hardware isn't available or the
916 // user passed --nvptx-no-fp16-math. The flag is useful because,
917 // although sm_53+ GPUs have some sort of FP16 support in
918 // hardware, only sm_53 and sm_60 have full implementation. Others
919 // only have token amount of hardware and are likely to run faster
920 // by using fp32 units instead.
921 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
922 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
923 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
924 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
925 // bf16 must be promoted to f32.
926 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
927 if (getOperationAction(Op, MVT::bf16) == Promote)
928 AddPromotedToType(Op, MVT::bf16, MVT::f32);
929 setOperationAction(Op, MVT::v2f32,
930 STI.hasF32x2Instructions() ? Legal : Expand);
931 }
932
933 // On SM80, we select add/mul/sub as fma to avoid promotion to float
934 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
935 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
936 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
938 }
939 }
940 }
941
942 // f16/f16x2 neg was introduced in PTX 60, SM_53.
943 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
944 STI.getPTXVersion() >= 60 &&
945 STI.allowFP16Math();
946 for (const auto &VT : {MVT::f16, MVT::v2f16})
948 IsFP16FP16x2NegAvailable ? Legal : Expand);
949
950 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
951 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
952 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
953 // (would be) Library functions.
954
955 // These map to conversion instructions for scalar FP types.
956 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
958 setOperationAction(Op, MVT::f16, Legal);
959 setOperationAction(Op, MVT::f32, Legal);
960 setOperationAction(Op, MVT::f64, Legal);
961 setOperationAction(Op, MVT::v2f16, Expand);
962 setOperationAction(Op, MVT::v2bf16, Expand);
963 setOperationAction(Op, MVT::v2f32, Expand);
964 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
965 if (getOperationAction(Op, MVT::bf16) == Promote)
966 AddPromotedToType(Op, MVT::bf16, MVT::f32);
967 }
968
969 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
971 }
972 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
973 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
976 }
977 }
978
979 // Expand v2f32 = fp_extend
981 // Expand v2[b]f16 = fp_round v2f32
982 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
983
984 // sm_80 only has conversions between f32 and bf16. Custom lower all other
985 // bf16 conversions.
986 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
987 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
990 VT, Custom);
991 }
994 MVT::bf16, Custom);
995 }
996
1000 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
1004 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
1005
1006 // 'Expand' implements FCOPYSIGN without calling an external library.
1013
1014 // These map to corresponding instructions for f32/f64. f16 must be
1015 // promoted to f32. v2f16 is expanded to f16, which is then promoted
1016 // to f32.
1017 for (const auto &Op :
1019 setOperationAction(Op, MVT::f16, Promote);
1020 setOperationAction(Op, MVT::f32, Legal);
1021 // only div/rem/sqrt are legal for f64
1022 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
1023 setOperationAction(Op, MVT::f64, Legal);
1024 }
1025 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
1026 setOperationAction(Op, MVT::bf16, Promote);
1027 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1028 }
1029 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1030
1031 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1032 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1033 if (STI.getPTXVersion() >= 65) {
1034 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1035 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1036 } else {
1038 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1039 }
1040 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1041 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1042 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1043 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1044
1045 for (const auto &Op :
1047 setOperationAction(Op, MVT::f32, Legal);
1048 setOperationAction(Op, MVT::f64, Legal);
1049 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1050 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1051 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1052 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1053 if (getOperationAction(Op, MVT::bf16) == Promote)
1054 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1055 setOperationAction(Op, MVT::v2f32, Expand);
1056 }
1057 bool SupportsF32MinMaxNaN =
1058 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1059 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1060 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1061 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1062 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1063 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1064 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1065 setOperationAction(Op, MVT::v2f32, Expand);
1066 }
1067
1068 // Custom lowering for inline asm with 128-bit operands
1071
1072 // FEXP2 support:
1073 // - f32
1074 // - f16/f16x2 (sm_75+, PTX 7.0+)
1075 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1076 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1078 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1079 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1080 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1081 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1082 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1083
1084 // FLOG2 supports f32 only
1085 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1086 if (UseApproxLog2F32) {
1088 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1089 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1090 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1091 Expand);
1092 }
1093
1094 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1095
1096 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1097
1098 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1099 // type, we need to custom lower it.
1101 Custom);
1102
1103 // Now deduce the information based on the above mentioned
1104 // actions
1105 computeRegisterProperties(STI.getRegisterInfo());
1106
1107 // PTX support for 16-bit CAS is emulated. Only use 32+
1108 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1109 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1111
1112 // Custom lowering for tcgen05.ld vector operands
1114 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1115 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::v2f32,
1116 MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32,
1117 MVT::v64f32, MVT::v128f32},
1118 Custom);
1119
1120 // Custom lowering for tcgen05.st vector operands
1122 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1123 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1124 Custom);
1125
1126 // Enable custom lowering for the following:
1127 // * MVT::i128 - clusterlaunchcontrol
1128 // * MVT::i32 - prmt
1129 // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
1130 // * MVT::Other - internal.addrspace.wrap
1132 {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
1133
1134 // Custom lowering for bswap
1135 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
1136 Custom);
1137}
1138
1141 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1142 VT.getScalarType() == MVT::i1)
1143 return TypeSplitVector;
1145}
1146
1148 int Enabled, int &ExtraSteps,
1149 bool &UseOneConst,
1150 bool Reciprocal) const {
1153 return SDValue();
1154
1155 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1156 ExtraSteps = 0;
1157
1158 SDLoc DL(Operand);
1159 EVT VT = Operand.getValueType();
1160 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1161
1162 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1163 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1164 DAG.getConstant(IID, DL, MVT::i32), Operand);
1165 };
1166
1167 // The sqrt and rsqrt refinement processes assume we always start out with an
1168 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1169 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1170 // any refinement, we must return a regular sqrt.
1171 if (Reciprocal || ExtraSteps > 0) {
1172 if (VT == MVT::f32)
1173 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1174 : Intrinsic::nvvm_rsqrt_approx_f);
1175 else if (VT == MVT::f64)
1176 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1177 else
1178 return SDValue();
1179 } else {
1180 if (VT == MVT::f32)
1181 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1182 : Intrinsic::nvvm_sqrt_approx_f);
1183 else {
1184 // There's no sqrt.approx.f64 instruction, so we emit
1185 // reciprocal(rsqrt(x)). This is faster than
1186 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1187 // x * rsqrt(x).)
1188 return DAG.getNode(
1190 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1191 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1192 }
1193 }
1194}
1195
1196static Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx,
1197 const DataLayout &DL);
1198
1200 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1202 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1203 unsigned UniqueCallSite) const {
1204 auto PtrVT = getPointerTy(DL);
1205
1206 std::string Prototype;
1207 raw_string_ostream O(Prototype);
1208 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1209
1210 if (RetTy->isVoidTy()) {
1211 O << "()";
1212 } else {
1213 O << "(";
1214 if (shouldPassAsArray(RetTy)) {
1215 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1216 O << ".param .align " << RetAlign.value() << " .b8 _["
1217 << DL.getTypeAllocSize(RetTy) << "]";
1218 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1219 unsigned size = 0;
1220 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1221 size = ITy->getBitWidth();
1222 } else {
1223 assert(RetTy->isFloatingPointTy() &&
1224 "Floating point type expected here");
1225 size = RetTy->getPrimitiveSizeInBits();
1226 }
1227 // PTX ABI requires all scalar return values to be at least 32
1228 // bits in size. fp16 normally uses .b16 as its storage type in
1229 // PTX, so its size must be adjusted here, too.
1231
1232 O << ".param .b" << size << " _";
1233 } else if (isa<PointerType>(RetTy)) {
1234 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1235 } else {
1236 llvm_unreachable("Unknown return type");
1237 }
1238 O << ") ";
1239 }
1240 O << "_ (";
1241
1242 bool first = true;
1243
1244 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1245 auto AllOuts = ArrayRef(Outs);
1246 for (const unsigned I : llvm::seq(NumArgs)) {
1247 const auto ArgOuts =
1248 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1249 AllOuts = AllOuts.drop_front(ArgOuts.size());
1250
1251 Type *Ty = Args[I].Ty;
1252 if (!first) {
1253 O << ", ";
1254 }
1255 first = false;
1256
1257 if (ArgOuts[0].Flags.isByVal()) {
1258 // Indirect calls need strict ABI alignment so we disable optimizations by
1259 // not providing a function to optimize.
1260 Type *ETy = Args[I].IndirectType;
1261 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1262 Align ParamByValAlign =
1263 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1264
1265 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1266 << ArgOuts[0].Flags.getByValSize() << "]";
1267 } else {
1268 if (shouldPassAsArray(Ty)) {
1269 Align ParamAlign =
1270 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1271 O << ".param .align " << ParamAlign.value() << " .b8 _["
1272 << DL.getTypeAllocSize(Ty) << "]";
1273 continue;
1274 }
1275 // i8 types in IR will be i16 types in SDAG
1276 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1277 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1278 "type mismatch between callee prototype and arguments");
1279 // scalar type
1280 unsigned sz = 0;
1281 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1282 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1283 } else if (isa<PointerType>(Ty)) {
1284 sz = PtrVT.getSizeInBits();
1285 } else {
1286 sz = Ty->getPrimitiveSizeInBits();
1287 }
1288 O << ".param .b" << sz << " _";
1289 }
1290 }
1291
1292 if (FirstVAArg)
1293 O << (first ? "" : ",") << " .param .align "
1294 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1295 O << ")";
1296 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1297 O << " .noreturn";
1298 O << ";";
1299
1300 return Prototype;
1301}
1302
1303static Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx,
1304 const DataLayout &DL) {
1305 if (!CB) {
1306 // CallSite is zero, fallback to ABI type alignment
1307 return DL.getABITypeAlign(Ty);
1308 }
1309
1310 const Function *DirectCallee = CB->getCalledFunction();
1311
1312 if (!DirectCallee) {
1313 // We don't have a direct function symbol, but that may be because of
1314 // constant cast instructions in the call.
1315
1316 // With bitcast'd call targets, the instruction will be the call
1317 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1318 // Check if we have call alignment metadata
1319 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1320 return StackAlign.value();
1321 }
1322 DirectCallee = getMaybeBitcastedCallee(CB);
1323 }
1324
1325 // Check for function alignment information if we found that the
1326 // ultimate target is a Function
1327 if (DirectCallee)
1328 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1329
1330 // Call is indirect, fall back to the ABI type alignment
1331 return DL.getABITypeAlign(Ty);
1332}
1333
1335 const DataLayout &DL,
1336 const TargetLowering &TL) {
1337 if (Ptr->getOpcode() == ISD::FrameIndex) {
1338 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1339 Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
1341
1343 }
1344
1345 // Peel off an addrspacecast to generic and load directly from the specific
1346 // address space.
1347 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1348 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1349 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1350 Ptr = ASC->getOperand(0);
1351 return MachinePointerInfo(ASC->getSrcAddressSpace());
1352 }
1353 }
1354
1355 return MachinePointerInfo();
1356}
1357
1359 if (Flags.isSExt())
1360 return ISD::SIGN_EXTEND;
1361 if (Flags.isZExt())
1362 return ISD::ZERO_EXTEND;
1363 return ISD::ANY_EXTEND;
1364}
1365
1367 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1368 SDLoc dl) {
1369 const EVT ActualVT = V.getValueType();
1370 assert((ActualVT == ExpectedVT ||
1371 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1372 "Non-integer argument type size mismatch");
1373 if (ExpectedVT.bitsGT(ActualVT))
1374 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1375 if (ExpectedVT.bitsLT(ActualVT))
1376 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1377
1378 return V;
1379}
1380
1382 SmallVectorImpl<SDValue> &InVals) const {
1383
1384 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1386 "Support for variadic functions (unsized array parameter) introduced "
1387 "in PTX ISA version 6.0 and requires target sm_30.");
1388
1389 SelectionDAG &DAG = CLI.DAG;
1390 SDLoc dl = CLI.DL;
1391 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1392 SDValue Callee = CLI.Callee;
1393 ArgListTy &Args = CLI.getArgs();
1394 Type *RetTy = CLI.RetTy;
1395 const CallBase *CB = CLI.CB;
1396 const DataLayout &DL = DAG.getDataLayout();
1397 LLVMContext &Ctx = *DAG.getContext();
1398
1399 const auto GetI32 = [&](const unsigned I) {
1400 return DAG.getConstant(I, dl, MVT::i32);
1401 };
1402
1403 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1404 const SDValue CallChain = CLI.Chain;
1405 const SDValue StartChain =
1406 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1407 SDValue DeclareGlue = StartChain.getValue(1);
1408
1409 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1410
1411 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1412 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1413 // loaded/stored using i16, so it's handled here as well.
1414 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1415 SDValue Declare =
1416 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1417 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1418 CallPrereqs.push_back(Declare);
1419 DeclareGlue = Declare.getValue(1);
1420 return Declare;
1421 };
1422
1423 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1424 unsigned Size) {
1425 SDValue Declare = DAG.getNode(
1426 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1427 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1428 CallPrereqs.push_back(Declare);
1429 DeclareGlue = Declare.getValue(1);
1430 return Declare;
1431 };
1432
1433 // Variadic arguments.
1434 //
1435 // Normally, for each argument, we declare a param scalar or a param
1436 // byte array in the .param space, and store the argument value to that
1437 // param scalar or array starting at offset 0.
1438 //
1439 // In the case of the first variadic argument, we declare a vararg byte array
1440 // with size 0. The exact size of this array isn't known at this point, so
1441 // it'll be patched later. All the variadic arguments will be stored to this
1442 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1443 // initially set to 0, so it can be used for non-variadic arguments (which use
1444 // 0 offset) to simplify the code.
1445 //
1446 // After all varargs are processed, 'VAOffset' holds the size of the
1447 // vararg byte array.
1448 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1449 "Non-VarArg function with extra arguments");
1450
1451 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1452 unsigned VAOffset = 0; // current offset in the param array
1453
1454 const SDValue VADeclareParam =
1455 CLI.Args.size() > FirstVAArg
1456 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1457 Align(STI.getMaxRequiredAlignment()), 0)
1458 : SDValue();
1459
1460 // Args.size() and Outs.size() need not match.
1461 // Outs.size() will be larger
1462 // * if there is an aggregate argument with multiple fields (each field
1463 // showing up separately in Outs)
1464 // * if there is a vector argument with more than typical vector-length
1465 // elements (generally if more than 4) where each vector element is
1466 // individually present in Outs.
1467 // So a different index should be used for indexing into Outs/OutVals.
1468 // See similar issue in LowerFormalArguments.
1469 auto AllOuts = ArrayRef(CLI.Outs);
1470 auto AllOutVals = ArrayRef(CLI.OutVals);
1471 assert(AllOuts.size() == AllOutVals.size() &&
1472 "Outs and OutVals must be the same size");
1473 // Declare the .params or .reg needed to pass values
1474 // to the function
1475 for (const auto E : llvm::enumerate(Args)) {
1476 const auto ArgI = E.index();
1477 const auto Arg = E.value();
1478 const auto ArgOuts =
1479 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1480 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1481 AllOuts = AllOuts.drop_front(ArgOuts.size());
1482 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1483
1484 const bool IsVAArg = (ArgI >= FirstVAArg);
1485 const bool IsByVal = Arg.IsByVal;
1486
1487 const SDValue ParamSymbol =
1488 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1489
1490 assert((!IsByVal || Arg.IndirectType) &&
1491 "byval arg must have indirect type");
1492 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1493
1494 const Align ArgAlign = [&]() {
1495 if (IsByVal) {
1496 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1497 // so we don't need to worry whether it's naturally aligned or not.
1498 // See TargetLowering::LowerCallTo().
1499 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1501 InitialAlign, DL);
1502 }
1503 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1504 }();
1505
1506 const unsigned TySize = DL.getTypeAllocSize(ETy);
1507 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1508 "type size mismatch");
1509
1510 const SDValue ArgDeclare = [&]() {
1511 if (IsVAArg)
1512 return VADeclareParam;
1513
1514 if (IsByVal || shouldPassAsArray(Arg.Ty))
1515 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1516
1517 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1518 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1519 "Only int and float types are supported as non-array arguments");
1520
1521 return MakeDeclareScalarParam(ParamSymbol, TySize);
1522 }();
1523
1524 if (IsByVal) {
1525 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1526 SDValue SrcPtr = ArgOutVals[0];
1527 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1528 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1529
1530 if (IsVAArg)
1531 VAOffset = alignTo(VAOffset, ArgAlign);
1532
1533 SmallVector<EVT, 4> ValueVTs, MemVTs;
1535 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1536
1537 unsigned J = 0;
1538 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1539 for (const unsigned NumElts : VI) {
1540 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1541 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1542 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1543 SDValue SrcLoad =
1544 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1545
1546 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1547 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1548 SDValue ParamAddr =
1549 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1550 SDValue StoreParam = DAG.getStore(
1551 ArgDeclare, dl, SrcLoad, ParamAddr,
1553 CallPrereqs.push_back(StoreParam);
1554
1555 J += NumElts;
1556 }
1557 if (IsVAArg)
1558 VAOffset += TySize;
1559 } else {
1562 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1563 VAOffset);
1564 assert(VTs.size() == Offsets.size() && "Size mismatch");
1565 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1566
1567 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1568 // than 32-bits are sign extended or zero extended, depending on
1569 // whether they are signed or unsigned types. This case applies
1570 // only to scalar parameters and not to aggregate values.
1571 const bool ExtendIntegerParam =
1572 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1573
1574 const auto GetStoredValue = [&](const unsigned I) {
1575 SDValue StVal = ArgOutVals[I];
1577 StVal.getValueType() &&
1578 "OutVal type should always be legal");
1579
1580 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1581 const EVT StoreVT =
1582 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1583
1584 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1585 };
1586
1587 unsigned J = 0;
1588 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1589 for (const unsigned NumElts : VI) {
1590 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1591
1592 unsigned Offset;
1593 if (IsVAArg) {
1594 // TODO: We may need to support vector types that can be passed
1595 // as scalars in variadic arguments.
1596 assert(NumElts == 1 &&
1597 "Vectorization should be disabled for vaargs.");
1598
1599 // Align each part of the variadic argument to their type.
1600 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1601 Offset = VAOffset;
1602
1603 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1604 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1605 } else {
1606 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1607 Offset = Offsets[J];
1608 }
1609
1610 SDValue Ptr =
1611 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1612
1613 const MaybeAlign CurrentAlign = ExtendIntegerParam
1614 ? MaybeAlign(std::nullopt)
1615 : commonAlignment(ArgAlign, Offset);
1616
1617 SDValue Val =
1618 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1619 return GetStoredValue(J + K);
1620 });
1621
1622 SDValue StoreParam = DAG.getStore(
1623 ArgDeclare, dl, Val, Ptr,
1625 CallPrereqs.push_back(StoreParam);
1626
1627 J += NumElts;
1628 }
1629 }
1630 }
1631
1632 // Handle Result
1633 if (!Ins.empty()) {
1634 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1635 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1636 if (shouldPassAsArray(RetTy)) {
1637 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1638 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1639 } else {
1640 MakeDeclareScalarParam(RetSymbol, ResultSize);
1641 }
1642 }
1643
1644 // Set the size of the vararg param byte array if the callee is a variadic
1645 // function and the variadic part is not empty.
1646 if (VADeclareParam) {
1647 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1648 VADeclareParam.getOperand(1),
1649 VADeclareParam.getOperand(2), GetI32(VAOffset),
1650 VADeclareParam.getOperand(4)};
1651 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1652 VADeclareParam->getVTList(), DeclareParamOps);
1653 }
1654
1655 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1656 const auto *CalleeF = Func ? dyn_cast<Function>(Func->getGlobal()) : nullptr;
1657
1658 // If the type of the callsite does not match that of the function, convert
1659 // the callsite to an indirect call.
1660 const bool ConvertToIndirectCall =
1661 CalleeF && CB->getFunctionType() != CalleeF->getFunctionType();
1662
1663 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1664 // between them we must rely on the call site value which is valid for
1665 // indirect calls but is always null for libcalls.
1666 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1667
1668 if (isa<ExternalSymbolSDNode>(Callee)) {
1669 Function* CalleeFunc = nullptr;
1670
1671 // Try to find the callee in the current module.
1672 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1673 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1674
1675 // Set the "libcall callee" attribute to indicate that the function
1676 // must always have a declaration.
1677 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1678 }
1679
1680 if (IsIndirectCall) {
1681 // This is indirect function call case : PTX requires a prototype of the
1682 // form
1683 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1684 // to be emitted, and the label has to be used as the last arg of call
1685 // instruction.
1686 // The prototype is embedded in a string and put as the operand for a
1687 // CallPrototype SDNode which will print out to the value of the string.
1688 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1689 std::string Proto =
1690 getPrototype(DL, RetTy, Args, CLI.Outs,
1691 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1692 UniqueCallSite);
1693 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1694 const SDValue PrototypeDeclare = DAG.getNode(
1695 NVPTXISD::CallPrototype, dl, MVT::Other,
1696 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1697 CallPrereqs.push_back(PrototypeDeclare);
1698 }
1699
1700 const bool IsUnknownIntrinsic =
1701 CalleeF && CalleeF->isIntrinsic() &&
1702 CalleeF->getIntrinsicID() == Intrinsic::not_intrinsic;
1703 if (IsUnknownIntrinsic) {
1706 "call to unknown intrinsic '" + CalleeF->getName() +
1707 "' cannot be lowered by the NVPTX backend",
1708 dl.getDebugLoc()));
1709 }
1710
1711 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1712 const unsigned NumArgs =
1713 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1714 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1715 /// NumParams, Callee, Proto)
1716 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1717 const SDValue Call = DAG.getNode(
1718 NVPTXISD::CALL, dl, MVT::Other,
1719 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1720 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1721
1722 SmallVector<SDValue, 16> LoadChains{Call};
1723 SmallVector<SDValue, 16> ProxyRegOps;
1724 if (!Ins.empty()) {
1727 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1728 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1729
1730 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1731 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1732
1733 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1734 // 32-bits are sign extended or zero extended, depending on whether
1735 // they are signed or unsigned types.
1736 const bool ExtendIntegerRetVal =
1737 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1738
1739 unsigned I = 0;
1740 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1741 for (const unsigned NumElts : VI) {
1742 const MaybeAlign CurrentAlign =
1743 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1744 : commonAlignment(RetAlign, Offsets[I]);
1745
1746 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1747 const EVT LoadVT =
1748 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1749 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1750 SDValue Ptr =
1751 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1752
1753 SDValue R = DAG.getLoad(
1754 VecVT, dl, Call, Ptr,
1756
1757 LoadChains.push_back(R.getValue(1));
1758 for (const unsigned J : llvm::seq(NumElts))
1759 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1760 I += NumElts;
1761 }
1762 }
1763
1764 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1765 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1766 UniqueCallSite + 1, SDValue(), dl);
1767
1768 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1769 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1770 // dangling.
1771 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1772 SDValue Proxy =
1773 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1774 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1775 InVals.push_back(Ret);
1776 }
1777
1778 // set IsTailCall to false for now, until we figure out how to express
1779 // tail call optimization in PTX
1780 CLI.IsTailCall = false;
1781 return CallEnd;
1782}
1783
1785 SelectionDAG &DAG) const {
1786
1787 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1788 const Function &Fn = DAG.getMachineFunction().getFunction();
1789
1791 Fn,
1792 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1793 "requires target sm_52.",
1794 SDLoc(Op).getDebugLoc()));
1795 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1796 Op.getOperand(0)};
1797 return DAG.getMergeValues(Ops, SDLoc());
1798 }
1799
1800 SDLoc DL(Op.getNode());
1801 SDValue Chain = Op.getOperand(0);
1802 SDValue Size = Op.getOperand(1);
1803 uint64_t Align = Op.getConstantOperandVal(2);
1804
1805 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1806 // the default stack alignment should be used.
1807 if (Align == 0)
1809
1810 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
1811 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1812
1813 SDValue Alloc =
1814 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1815 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1816 DAG.getTargetConstant(Align, DL, MVT::i32)});
1817
1818 SDValue ASC = DAG.getAddrSpaceCast(
1820
1821 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1822}
1823
1825 SelectionDAG &DAG) const {
1826 SDLoc DL(Op.getNode());
1827 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1828 const Function &Fn = DAG.getMachineFunction().getFunction();
1829
1831 Fn,
1832 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1833 ">= sm_52.",
1834 DL.getDebugLoc()));
1835 return Op.getOperand(0);
1836 }
1837
1838 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1839 SDValue Chain = Op.getOperand(0);
1840 SDValue Ptr = Op.getOperand(1);
1841 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1843 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1844}
1845
1847 SelectionDAG &DAG) const {
1848 SDLoc DL(Op.getNode());
1849 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1850 const Function &Fn = DAG.getMachineFunction().getFunction();
1851
1853 Fn,
1854 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1855 "sm_52.",
1856 DL.getDebugLoc()));
1857 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1858 return DAG.getMergeValues(Ops, DL);
1859 }
1860
1861 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1862 SDValue Chain = Op.getOperand(0);
1863 SDValue SS =
1864 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1865 SDValue ASC = DAG.getAddrSpaceCast(
1866 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1867 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1868}
1869
1870// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1871// (see LegalizeDAG.cpp). This is slow and uses local memory.
1872// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
1873SDValue
1874NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1875 SDNode *Node = Op.getNode();
1876 SDLoc dl(Node);
1878 unsigned NumOperands = Node->getNumOperands();
1879 for (unsigned i = 0; i < NumOperands; ++i) {
1880 SDValue SubOp = Node->getOperand(i);
1881 EVT VVT = SubOp.getNode()->getValueType(0);
1882 EVT EltVT = VVT.getVectorElementType();
1883 unsigned NumSubElem = VVT.getVectorNumElements();
1884 for (unsigned j = 0; j < NumSubElem; ++j) {
1885 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1886 DAG.getIntPtrConstant(j, dl)));
1887 }
1888 }
1889 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1890}
1891
1893 SelectionDAG &DAG,
1894 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1895 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1896 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1897 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1898 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1899}
1900
1902 SelectionDAG &DAG,
1903 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1904 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1905}
1906
1907/// Reduces the elements using the scalar operations provided. The operations
1908/// are sorted descending in number of inputs they take. The flags on the
1909/// original reduction operation will be propagated to each scalar operation.
1910/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1911/// used in ExpandReductions and SelectionDAG.
1913 const SmallVector<SDValue> &Elements, EVT EltTy,
1914 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1915 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1916 // Build the reduction tree at each level, starting with all the elements.
1917 SmallVector<SDValue> Level = Elements;
1918
1919 unsigned OpIdx = 0;
1920 while (Level.size() > 1) {
1921 // Try to reduce this level using the current operator.
1922 const auto [Op, NumInputs] = Ops[OpIdx];
1923
1924 // Build the next level by partially reducing all elements.
1925 SmallVector<SDValue> ReducedLevel;
1926 unsigned I = 0, E = Level.size();
1927 for (; I + NumInputs <= E; I += NumInputs) {
1928 // Reduce elements in groups of [NumInputs], as much as possible.
1929 ReducedLevel.push_back(DAG.getNode(
1930 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1931 }
1932
1933 if (I < E) {
1934 // Handle leftover elements.
1935
1936 if (ReducedLevel.empty()) {
1937 // We didn't reduce anything at this level. We need to pick a smaller
1938 // operator.
1939 ++OpIdx;
1940 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1941 continue;
1942 }
1943
1944 // We reduced some things but there's still more left, meaning the
1945 // operator's number of inputs doesn't evenly divide this level size. Move
1946 // these elements to the next level.
1947 for (; I < E; ++I)
1948 ReducedLevel.push_back(Level[I]);
1949 }
1950
1951 // Process the next level.
1952 Level = ReducedLevel;
1953 }
1954
1955 return *Level.begin();
1956}
1957
1958// Get scalar reduction opcode
1959static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1960 switch (ReductionOpcode) {
1962 return ISD::FMAXNUM;
1964 return ISD::FMINNUM;
1966 return ISD::FMAXIMUM;
1968 return ISD::FMINIMUM;
1969 default:
1970 llvm_unreachable("unhandled reduction opcode");
1971 }
1972}
1973
1974/// Get 3-input scalar reduction opcode
1975static std::optional<unsigned>
1976getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
1977 switch (ReductionOpcode) {
1979 return NVPTXISD::FMAXNUM3;
1981 return NVPTXISD::FMINNUM3;
1983 return NVPTXISD::FMAXIMUM3;
1985 return NVPTXISD::FMINIMUM3;
1986 default:
1987 return std::nullopt;
1988 }
1989}
1990
1991/// Lower reductions to either a sequence of operations or a tree if
1992/// reassociations are allowed. This method will use larger operations like
1993/// max3/min3 when the target supports them.
1994SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
1995 SelectionDAG &DAG) const {
1996 SDLoc DL(Op);
1997 const SDNodeFlags Flags = Op->getFlags();
1998 SDValue Vector = Op.getOperand(0);
1999
2000 const unsigned Opcode = Op->getOpcode();
2001 const EVT EltTy = Vector.getValueType().getVectorElementType();
2002
2003 // Whether we can use 3-input min/max when expanding the reduction.
2004 const bool CanUseMinMax3 =
2005 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2006 STI.getPTXVersion() >= 88 &&
2007 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2008 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2009
2010 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2011 // number of inputs they take.
2012 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2013
2014 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2015 CanUseMinMax3 && Opcode3Elem)
2016 ScalarOps.push_back({*Opcode3Elem, 3});
2017 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2018
2020 DAG.ExtractVectorElements(Vector, Elements);
2021
2022 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2023}
2024
2025SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2026 // Handle bitcasting from v2i8 without hitting the default promotion
2027 // strategy which goes through stack memory.
2028 EVT FromVT = Op->getOperand(0)->getValueType(0);
2029 if (FromVT != MVT::v2i8) {
2030 return Op;
2031 }
2032
2033 // Pack vector elements into i16 and bitcast to final type
2034 SDLoc DL(Op);
2035 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2036 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2037 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2038 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2039 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2040 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2041 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2042 SDValue AsInt = DAG.getNode(
2043 ISD::OR, DL, MVT::i16,
2044 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2045 EVT ToVT = Op->getValueType(0);
2046 return DAG.getBitcast(ToVT, AsInt);
2047}
2048
2049// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2050// would get lowered as two constant loads and vector-packing move.
2051// Instead we want just a constant move:
2052// mov.b32 %r2, 0x40003C00
2053SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2054 SelectionDAG &DAG) const {
2055 EVT VT = Op->getValueType(0);
2056 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2057 return Op;
2058 SDLoc DL(Op);
2059
2060 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2061 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2062 isa<ConstantFPSDNode>(Operand);
2063 })) {
2064 if (VT != MVT::v4i8)
2065 return Op;
2066 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2067 // to optimize calculation of constant parts.
2068 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2069 uint64_t SelectionValue) -> SDValue {
2070 SDValue L = Left;
2071 SDValue R = Right;
2072 if (Cast) {
2073 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2074 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2075 }
2076 return getPRMT(L, R, SelectionValue, DL, DAG);
2077 };
2078 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2079 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2080 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2081 return DAG.getBitcast(VT, PRMT3210);
2082 }
2083
2084 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2085 auto GetOperand = [](SDValue Op, int N) -> APInt {
2086 const SDValue &Operand = Op->getOperand(N);
2087 EVT VT = Op->getValueType(0);
2088 if (Operand->isUndef())
2089 return APInt(32, 0);
2090 APInt Value;
2091 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2092 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2093 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2094 Value = Operand->getAsAPIntVal();
2095 else
2096 llvm_unreachable("Unsupported type");
2097 // i8 values are carried around as i16, so we need to zero out upper bits,
2098 // so they do not get in the way of combining individual byte values
2099 if (VT == MVT::v4i8)
2100 Value = Value.trunc(8);
2101 return Value.zext(32);
2102 };
2103
2104 // Construct a 32-bit constant by shifting into place smaller values
2105 // (elements of the vector type VT).
2106 // For example, if VT has 2 elements, then N == 2:
2107 // ShiftAmount = 32 / N = 16
2108 // Value |= Op0 (b16) << 0
2109 // Value |= Op1 (b16) << 16
2110 // If N == 4:
2111 // ShiftAmount = 32 / N = 8
2112 // Value |= Op0 (b8) << 0
2113 // Value |= Op1 (b8) << 8
2114 // Value |= Op2 (b8) << 16
2115 // Value |= Op3 (b8) << 24
2116 // ...etc
2117 APInt Value(32, 0);
2118 const unsigned NumElements = VT.getVectorNumElements();
2119 assert(32 % NumElements == 0 && "must evenly divide bit length");
2120 const unsigned ShiftAmount = 32 / NumElements;
2121 for (unsigned ElementNo : seq(NumElements))
2122 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2123 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2124 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2125}
2126
2127SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2128 SelectionDAG &DAG) const {
2129 SDValue Index = Op->getOperand(1);
2130 SDValue Vector = Op->getOperand(0);
2131 SDLoc DL(Op);
2132 EVT VectorVT = Vector.getValueType();
2133
2134 if (VectorVT == MVT::v4i8) {
2135 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2136 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2137 DAG.getConstant(0x7770, DL, MVT::i32));
2138 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2139 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2140 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2141 SDNodeFlags Flags;
2142 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2143 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2144 Ext->setFlags(Flags);
2145 return Ext;
2146 }
2147
2148 // Constant index will be matched by tablegen.
2149 if (isa<ConstantSDNode>(Index.getNode()))
2150 return Op;
2151
2152 // Extract individual elements and select one of them.
2153 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2154 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2155 EVT EltVT = VectorVT.getVectorElementType();
2156
2157 SDLoc dl(Op.getNode());
2158 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2159 DAG.getIntPtrConstant(0, dl));
2160 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2161 DAG.getIntPtrConstant(1, dl));
2162 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2164}
2165
2166SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2167 SelectionDAG &DAG) const {
2168 SDValue Vector = Op->getOperand(0);
2169 EVT VectorVT = Vector.getValueType();
2170
2171 if (VectorVT != MVT::v4i8)
2172 return Op;
2173 SDLoc DL(Op);
2174 SDValue Value = Op->getOperand(1);
2175 if (Value->isUndef())
2176 return Vector;
2177
2178 SDValue Index = Op->getOperand(2);
2179
2180 SDValue BFI =
2181 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2182 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2183 DAG.getNode(ISD::MUL, DL, MVT::i32,
2184 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2185 DAG.getConstant(8, DL, MVT::i32)),
2186 DAG.getConstant(8, DL, MVT::i32)});
2187 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2188}
2189
2190SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2191 SelectionDAG &DAG) const {
2192 SDValue V1 = Op.getOperand(0);
2193 EVT VectorVT = V1.getValueType();
2194 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2195 return Op;
2196
2197 // Lower shuffle to PRMT instruction.
2198 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2199 SDValue V2 = Op.getOperand(1);
2200 uint32_t Selector = 0;
2201 for (auto I : llvm::enumerate(SVN->getMask())) {
2202 if (I.value() != -1) // -1 is a placeholder for undef.
2203 Selector |= (I.value() << (I.index() * 4));
2204 }
2205
2206 SDLoc DL(Op);
2207 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2208 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2209 return DAG.getBitcast(Op.getValueType(), PRMT);
2210}
2211/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2212/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2213/// amount, or
2214/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2215/// amount.
2216SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2217 SelectionDAG &DAG) const {
2218 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2219 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2220
2221 EVT VT = Op.getValueType();
2222 unsigned VTBits = VT.getSizeInBits();
2223 SDLoc dl(Op);
2224 SDValue ShOpLo = Op.getOperand(0);
2225 SDValue ShOpHi = Op.getOperand(1);
2226 SDValue ShAmt = Op.getOperand(2);
2227 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2228
2229 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2230 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2231 // {dHi, dLo} = {aHi, aLo} >> Amt
2232 // dHi = aHi >> Amt
2233 // dLo = shf.r.clamp aLo, aHi, Amt
2234
2235 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2236 SDValue Lo =
2237 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2238
2239 SDValue Ops[2] = { Lo, Hi };
2240 return DAG.getMergeValues(Ops, dl);
2241 }
2242 else {
2243 // {dHi, dLo} = {aHi, aLo} >> Amt
2244 // - if (Amt>=size) then
2245 // dLo = aHi >> (Amt-size)
2246 // dHi = aHi >> Amt (this is either all 0 or all 1)
2247 // else
2248 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2249 // dHi = aHi >> Amt
2250
2251 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2252 DAG.getConstant(VTBits, dl, MVT::i32),
2253 ShAmt);
2254 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2255 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2256 DAG.getConstant(VTBits, dl, MVT::i32));
2257 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2258 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2259 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2260
2261 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2262 DAG.getConstant(VTBits, dl, MVT::i32),
2263 ISD::SETGE);
2264 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2265 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2266
2267 SDValue Ops[2] = { Lo, Hi };
2268 return DAG.getMergeValues(Ops, dl);
2269 }
2270}
2271
2272/// LowerShiftLeftParts - Lower SHL_PARTS, which
2273/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2274/// amount, or
2275/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2276/// amount.
2277SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2278 SelectionDAG &DAG) const {
2279 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2280 assert(Op.getOpcode() == ISD::SHL_PARTS);
2281
2282 EVT VT = Op.getValueType();
2283 unsigned VTBits = VT.getSizeInBits();
2284 SDLoc dl(Op);
2285 SDValue ShOpLo = Op.getOperand(0);
2286 SDValue ShOpHi = Op.getOperand(1);
2287 SDValue ShAmt = Op.getOperand(2);
2288
2289 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2290 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2291 // {dHi, dLo} = {aHi, aLo} << Amt
2292 // dHi = shf.l.clamp aLo, aHi, Amt
2293 // dLo = aLo << Amt
2294
2295 SDValue Hi =
2296 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2297 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2298
2299 SDValue Ops[2] = { Lo, Hi };
2300 return DAG.getMergeValues(Ops, dl);
2301 }
2302 else {
2303 // {dHi, dLo} = {aHi, aLo} << Amt
2304 // - if (Amt>=size) then
2305 // dLo = aLo << Amt (all 0)
2306 // dLo = aLo << (Amt-size)
2307 // else
2308 // dLo = aLo << Amt
2309 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2310
2311 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2312 DAG.getConstant(VTBits, dl, MVT::i32),
2313 ShAmt);
2314 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2315 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2316 DAG.getConstant(VTBits, dl, MVT::i32));
2317 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2318 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2319 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2320
2321 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2322 DAG.getConstant(VTBits, dl, MVT::i32),
2323 ISD::SETGE);
2324 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2325 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2326
2327 SDValue Ops[2] = { Lo, Hi };
2328 return DAG.getMergeValues(Ops, dl);
2329 }
2330}
2331
2332/// If the types match, convert the generic copysign to the NVPTXISD version,
2333/// otherwise bail ensuring that mismatched cases are properly expaned.
2334SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2335 SelectionDAG &DAG) const {
2336 EVT VT = Op.getValueType();
2337 SDLoc DL(Op);
2338
2339 SDValue In1 = Op.getOperand(0);
2340 SDValue In2 = Op.getOperand(1);
2341 EVT SrcVT = In2.getValueType();
2342
2343 if (!SrcVT.bitsEq(VT))
2344 return SDValue();
2345
2346 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2347}
2348
2349SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2350 EVT VT = Op.getValueType();
2351
2352 if (VT == MVT::f32)
2353 return LowerFROUND32(Op, DAG);
2354
2355 if (VT == MVT::f64)
2356 return LowerFROUND64(Op, DAG);
2357
2358 llvm_unreachable("unhandled type");
2359}
2360
2361// This is the the rounding method used in CUDA libdevice in C like code:
2362// float roundf(float A)
2363// {
2364// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2365// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2366// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2367// }
2368SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2369 SelectionDAG &DAG) const {
2370 SDLoc SL(Op);
2371 SDValue A = Op.getOperand(0);
2372 EVT VT = Op.getValueType();
2373
2374 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2375
2376 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2377 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2378 const unsigned SignBitMask = 0x80000000;
2379 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2380 DAG.getConstant(SignBitMask, SL, MVT::i32));
2381 const unsigned PointFiveInBits = 0x3F000000;
2382 SDValue PointFiveWithSignRaw =
2383 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2384 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2385 SDValue PointFiveWithSign =
2386 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2387 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2388 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2389
2390 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2391 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2392 SDValue IsLarge =
2393 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2394 ISD::SETOGT);
2395 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2396
2397 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2398 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2399 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2400 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2401 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2402}
2403
2404// The implementation of round(double) is similar to that of round(float) in
2405// that they both separate the value range into three regions and use a method
2406// specific to the region to round the values. However, round(double) first
2407// calculates the round of the absolute value and then adds the sign back while
2408// round(float) directly rounds the value with sign.
2409SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2410 SelectionDAG &DAG) const {
2411 SDLoc SL(Op);
2412 SDValue A = Op.getOperand(0);
2413 EVT VT = Op.getValueType();
2414
2415 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2416
2417 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2418 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2419 DAG.getConstantFP(0.5, SL, VT));
2420 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2421
2422 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2423 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2424 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2425 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2426 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2427 DAG.getConstantFP(0, SL, VT),
2428 RoundedA);
2429
2430 // Add sign to rounded_A
2431 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2432 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2433
2434 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2435 SDValue IsLarge =
2436 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2437 ISD::SETOGT);
2438 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2439}
2440
// NOTE(review): the line holding this function's signature is missing from
// this excerpt; the body refers to a node `N` and to `DAG`.
// Re-emits N's two-operand operation in f32 (or an f32 vector with the same
// element count) and converts the result back to N's original value type.
2442 EVT VT = N->getValueType(0);
2443 EVT NVT = MVT::f32;
2444 if (VT.isVector()) {
// Keep the element count of the original vector type.
2445 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2446 }
2447 SDLoc DL(N);
// Bring both operands to the f32-based type.
2448 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2449 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
// Same opcode and flags as the original node; only the type differs.
2450 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
// Convert back to the type the caller expects.
2451 return DAG.getFPExtendOrRound(Res, DL, VT);
2452}
2453
2454SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2455 SelectionDAG &DAG) const {
2456 if (useF32FTZ(DAG.getMachineFunction())) {
2457 return PromoteBinOpToF32(Op.getNode(), DAG);
2458 }
2459 return Op;
2460}
2461
2462SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2463 SelectionDAG &DAG) const {
2464 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2465
2466 if (Op.getValueType() == MVT::bf16) {
2467 SDLoc Loc(Op);
2468 return DAG.getNode(
2469 ISD::FP_ROUND, Loc, MVT::bf16,
2470 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2471 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2472 }
2473
2474 // Everything else is considered legal.
2475 return Op;
2476}
2477
2478SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2479 SelectionDAG &DAG) const {
2480 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2481
2482 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2483 SDLoc Loc(Op);
2484 return DAG.getNode(
2485 Op.getOpcode(), Loc, Op.getValueType(),
2486 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2487 }
2488
2489 // Everything else is considered legal.
2490 return Op;
2491}
2492
2493SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2494 SelectionDAG &DAG) const {
2495 EVT NarrowVT = Op.getValueType();
2496 SDValue Wide = Op.getOperand(0);
2497 EVT WideVT = Wide.getValueType();
2498 if (NarrowVT.getScalarType() == MVT::bf16) {
2499 const TargetLowering *TLI = STI.getTargetLowering();
2500 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2501 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2502 }
2503 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2504 // This combination was the first to support f32 -> bf16.
2505 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2506 if (WideVT.getScalarType() == MVT::f32) {
2507 return Op;
2508 }
2509 if (WideVT.getScalarType() == MVT::f64) {
2510 SDLoc Loc(Op);
2511 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2512 // the hardware f32 -> bf16 instruction.
2514 WideVT.changeElementType(*DAG.getContext(), MVT::f32), Wide, Loc,
2515 DAG);
2516 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2517 }
2518 }
2519 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2520 }
2521 }
2522
2523 // Everything else is considered legal.
2524 return Op;
2525}
2526
2527SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2528 SelectionDAG &DAG) const {
2529 SDValue Narrow = Op.getOperand(0);
2530 EVT NarrowVT = Narrow.getValueType();
2531 EVT WideVT = Op.getValueType();
2532 if (NarrowVT.getScalarType() == MVT::bf16) {
2533 if (WideVT.getScalarType() == MVT::f32 &&
2534 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2535 SDLoc Loc(Op);
2536 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2537 }
2538 if (WideVT.getScalarType() == MVT::f64 &&
2539 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2540 EVT F32 = NarrowVT.changeElementType(*DAG.getContext(), MVT::f32);
2541 SDLoc Loc(Op);
2542 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2543 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2544 } else {
2545 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2546 }
2547 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2548 }
2549 }
2550
2551 // Everything else is considered legal.
2552 return Op;
2553}
2554
// NOTE(review): the line holding this function's signature is missing from
// this excerpt; the body refers to `Op` and `DAG`.
// Scalarizes a v2i16 operation: the operation is applied per element to the
// extracted operand elements and the results are reassembled with
// BUILD_VECTOR. Any other result type is returned unchanged.
2556 SDLoc DL(Op);
2557 if (Op.getValueType() != MVT::v2i16)
2558 return Op;
2559 EVT EltVT = Op.getValueType().getVectorElementType();
2560 SmallVector<SDValue> VecElements;
// One scalar operation per vector element.
2561 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2562 SmallVector<SDValue> ScalarArgs;
// Extract element I from every operand of the original node.
2563 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2564 [&](const SDUse &O) {
2565 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2566 O.get(), DAG.getIntPtrConstant(I, DL));
2567 });
// Same opcode as the vector node, applied to the scalar operands.
2568 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2569 }
2570 SDValue V =
2571 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2572 return V;
2573}
2574
// NOTE(review): the first line of this function's signature and the
// declarations of `Ops` and `MemSD` are missing from this excerpt.
// Rebuilds the node as an INTRINSIC_VOID memory-intrinsic node in which every
// vector operand is replaced by its individual elements. `hasOffset` shifts
// the position of the operand that is checked for being a vector from 3 to 4.
2576 bool hasOffset = false) {
2577 // skip lowering if the vector operand is already legalized
2578 if (!Op->getOperand(hasOffset ? 4 : 3).getValueType().isVector())
2579 return Op;
2580
2581 SDNode *N = Op.getNode();
2582 SDLoc DL(N);
2584
2585 // split the vector argument
2586 for (size_t I = 0; I < N->getNumOperands(); I++) {
2587 SDValue Val = N->getOperand(I);
2588 EVT ValVT = Val.getValueType();
2589 if (ValVT.isVector()) {
2590 EVT EltVT = ValVT.getVectorElementType();
// Vector operands contribute one scalar operand per element, in order.
2591 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2592 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2593 DAG.getIntPtrConstant(J, DL)));
2594 } else
// Non-vector operands are forwarded unchanged.
2595 Ops.push_back(Val);
2596 }
2597
// The new node keeps the original value types, memory VT and memory operand.
2599 SDValue Tcgen05StNode =
2600 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2601 MemSD->getMemoryVT(), MemSD->getMemOperand());
2602
2603 return Tcgen05StNode;
2604}
2605
// NOTE(review): the line holding this function's signature is missing from
// this excerpt; the body refers to `Op` and `DAG`.
// Lowers a byte swap of the result type using PRMT with a fixed selector per
// type. Handles i16, i32, v2i16 and i64; other types are unreachable.
2607 SDLoc DL(Op);
2608 SDValue Src = Op.getOperand(0);
2609 EVT VT = Op.getValueType();
2610
2611 switch (VT.getSimpleVT().SimpleTy) {
2612 case MVT::i16: {
// Widen to i32, permute with selector 0x7701, then truncate back to i16.
2613 SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
2614 SDValue Swapped =
2615 getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
2616 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
2617 }
2618 case MVT::i32: {
// Single permute with selector 0x0123.
2619 return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
2620 }
2621 case MVT::v2i16: {
// Operate on the i32 bit pattern with selector 0x2301, then cast back.
2622 SDValue Converted = DAG.getBitcast(MVT::i32, Src);
2623 SDValue Swapped =
2624 getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
2625 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
2626 }
2627 case MVT::i64: {
// Split into two i32 values, permute each with selector 0x0123, and rebuild
// the i64 with the two permuted values in exchanged positions.
2628 SDValue UnpackSrc =
2629 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
2630 SDValue SwappedLow =
2631 getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2632 DL, DAG);
2633 SDValue SwappedHigh =
2634 getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2635 DL, DAG);
2636 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
2637 {SwappedHigh, SwappedLow});
2638 }
2639 default:
2640 llvm_unreachable("unsupported type for bswap");
2641 }
2642}
2643
2644static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2645 switch (IID) {
2646 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2647 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
2648 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2649 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
2650 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2651 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2652 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2653 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2654 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2655 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2656 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2657 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2658 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2659 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2660 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2661 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2662 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2663 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2664 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2665 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2666 case Intrinsic::
2667 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2668 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2669 case Intrinsic::
2670 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2671 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2672 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2673 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
2674 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2675 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
2676 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2677 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2678 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2679 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2680 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2681 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2682 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2683 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2684 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2685 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2686 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2687 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2688 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2689 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2690 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2691 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2692 case Intrinsic::
2693 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2694 return NVPTXISD::
2695 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2696 case Intrinsic::
2697 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2698 return NVPTXISD::
2699 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2700 };
2701 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2702}
2703
// NOTE(review): the line holding this function's signature and the
// declarations of `Ops` and `MemSD` are missing from this excerpt.
// Rebuilds the intrinsic node as a target memory-intrinsic node whose opcode
// comes from getTcgen05MMADisableOutputLane. The intrinsic-ID operand is
// dropped and every vector operand is replaced by its individual elements.
2705 SDNode *N = Op.getNode();
2706 SDLoc DL(N);
// Operand 1 carries the intrinsic ID as a constant.
2707 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2708
2710 // split the vector argument
2711 for (size_t I = 0; I < N->getNumOperands(); I++) {
2712 if (I == 1)
2713 continue; // skip IID
2714 SDValue Val = N->getOperand(I);
2715 EVT ValVT = Val.getValueType();
2716 if (ValVT.isVector()) {
2717 EVT EltVT = ValVT.getVectorElementType();
// Vector operands contribute one scalar operand per element, in order.
2718 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2719 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2720 DAG.getIntPtrConstant(J, DL)));
2721 } else
// Non-vector operands are forwarded unchanged.
2722 Ops.push_back(Val);
2723 }
2724
// The new node keeps the original value types, memory VT and memory operand.
2726 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2727 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2728 MemSD->getMemoryVT(), MemSD->getMemOperand());
2729
2730 return Tcgen05MMANode;
2731}
2732
2733// Lower vector return type of tcgen05.ld intrinsics
// Returns std::nullopt-like empty optional when result 0 of N is not a vector;
// otherwise returns the pair {BUILD_VECTOR of the scalar results, chain}.
// NOTE(review): the declaration of `MemSD` and the first line of the call that
// creates `NewNode` are missing from this excerpt.
2734static std::optional<std::pair<SDValue, SDValue>>
2735lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2736 SDLoc DL(N);
2737 EVT ResVT = N->getValueType(0);
2738 if (!ResVT.isVector())
2739 return {}; // already legalized.
2740
2741 const unsigned NumElts = ResVT.getVectorNumElements();
2742
2743 // Create the return type of the instructions
// One i32 result per element of the original vector result.
2744 SmallVector<EVT, 5> ListVTs;
2745 for (unsigned i = 0; i < NumElts; ++i)
2746 ListVTs.push_back(MVT::i32);
2747
2748 ListVTs.push_back(N->getValueType(1)); // Chain
2749
2750 SDVTList ResVTs = DAG.getVTList(ListVTs);
2751
// The first three operands of N are always forwarded.
2752 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2753 N->getOperand(2)};
2754
// With an offset, two further operands are forwarded instead of one.
2755 if (HasOffset) {
2756 Ops.push_back(N->getOperand(3)); // offset
2757 Ops.push_back(N->getOperand(4)); // Pack flag
2758 } else
2759 Ops.push_back(N->getOperand(3)); // Pack flag
2760
2762 SDValue NewNode =
2764 MemSD->getMemoryVT(), MemSD->getMemOperand());
2765
2766 // split the vector result
2767 SmallVector<SDValue, 4> ScalarRes;
2768 for (unsigned i = 0; i < NumElts; ++i) {
2769 SDValue Res = NewNode.getValue(i);
2770 ScalarRes.push_back(Res);
2771 }
2772
// The chain is the result that follows the NumElts scalar results.
2773 SDValue Chain = NewNode.getValue(NumElts);
2774 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2775 return {{BuildVector, Chain}};
2776}
2777
// NOTE(review): the first line of this function's signature, the declaration
// of `M`, and the opening of the call that receives the message below are
// missing from this excerpt.
// Builds the message "Intrinsic <name> with value <Val> is not supported on
// the given target." for the intrinsic whose ID is operand 1 of the node, and
// returns operand 0 of Op.
2779 unsigned Val) {
2780 SDNode *N = Op.getNode();
2781 SDLoc DL(N);
2782
2783 const Function &Fn = DAG.getMachineFunction().getFunction();
2784
// Address space used for the pointer type in the intrinsic name: taken from
// the node when it is a MemIntrinsicSDNode, 0 otherwise.
2785 unsigned AS = 0;
2786 if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N))
2787 AS = MemN->getAddressSpace();
2788 Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
2790
2792 Fn,
2793 "Intrinsic " +
2794 Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
2795 " with value " + Twine(Val) +
2796 " is not supported on the given target.",
2797 DL.getDebugLoc()));
2798 return Op.getOperand(0);
2799}
2800
// NOTE(review): the line holding this function's signature and the first line
// of the condition that guards the error path are missing from this excerpt.
// Reads the immediate at operand 3 and, when the (partially missing) check on
// that value succeeds, reports invalid usage through
// reportInvalidTensormapReplaceUsage; otherwise returns Op unchanged.
2802 SDNode *N = Op.getNode();
2803 SDLoc DL(N);
2804
2805 // immediate argument representing elemtype
2806 unsigned Val = N->getConstantOperandVal(3);
2807
2809 Val))
2810 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2811
2812 return Op;
2813}
2814
// NOTE(review): the line holding this function's signature and the first line
// of the condition that guards the error path are missing from this excerpt.
// Reads the immediate at operand 3 and, when the (partially missing) check on
// that value succeeds, reports invalid usage through
// reportInvalidTensormapReplaceUsage; otherwise returns Op unchanged.
2816 SDNode *N = Op.getNode();
2817 SDLoc DL(N);
2818
2819 // immediate argument representing swizzle mode
2820 unsigned Val = N->getConstantOperandVal(3);
2821
2823 Val))
2824 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2825
2826 return Op;
2827}
2828
// Dispatches custom lowering for intrinsic nodes whose intrinsic ID is stored
// in operand 1. Intrinsics without a case below leave the switch through
// `default` and are returned unchanged.
// NOTE(review): the signature line and the return statements for the
// tcgen05.mma group and for the swizzle_mode case are not visible in this
// listing.
2830 SDNode *N = Op.getNode();
2831 SDValue Intrin = N->getOperand(1);
2832
2833 // Get the intrinsic ID
2834 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2835 switch (IntrinNo) {
2836 default:
2837 break;
// tcgen05.st variants lowered without the extra offset flag.
2838 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2839 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2840 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2841 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2842 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2843 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2844 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2845 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2846 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2847 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2848 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2849 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2850 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2851 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2852 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2853 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2854 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2855 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2856 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2857 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2858 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2859 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2860 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2861 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2862 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2863 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2864 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2865 return lowerTcgen05St(Op, DAG);
// The 16x32bx2 tcgen05.st variants are lowered with hasOffset set.
2866 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2867 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2868 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2869 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2870 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2871 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2872 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2873 return lowerTcgen05St(Op, DAG, /* hasOffset */ true);
// tcgen05.mma variants carrying a disable_output_lane operand. The statement
// that handles this group is elided from this listing.
2874 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2875 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2876 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2877 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2878 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2879 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2880 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2881 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2882 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2883 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2884 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2885 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2886 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2887 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2888 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2889 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2890 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2891 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2892 case Intrinsic::
2893 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2894 case Intrinsic::
2895 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2896 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2897 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2898 case Intrinsic::
2899 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2900 case Intrinsic::
2901 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
// tensormap.replace variants whose immediate operand is validated during
// lowering.
2903 case Intrinsic::nvvm_tensormap_replace_elemtype:
2904 return lowerTensormapReplaceElemtype(Op, DAG);
2905 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
2907 }
// Reached from `default`: no custom lowering, keep the node.
2908 return Op;
2909}
2910
// Lowers the clusterlaunchcontrol.query_cancel.* intrinsics whose response
// operand (operand 1) is an i128: the response is bitcast to v2i64, split into
// two i64 halves, and fed to the matching NVPTXISD node. The intrinsic ID is
// read from operand 0. Returns an empty SDValue when operand 1 is not i128.
// NOTE(review): the first signature line is not visible in this listing.
2912 SelectionDAG &DAG) {
2913
2914 SDNode *N = Op.getNode();
2915 if (N->getOperand(1).getValueType() != MVT::i128) {
2916 // Operand is no longer an i128, i.e. it has already been lowered.
2917 return SDValue();
2918 }
2919
2920 unsigned IID =
2921 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
// Map the intrinsic ID onto its NVPTXISD opcode; any other ID is a bug.
2922 auto Opcode = [&]() {
2923 switch (IID) {
2924 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2925 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED;
2926 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2927 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X;
2928 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2929 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y;
2930 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2931 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z;
2932 default:
2933 llvm_unreachable("unsupported/unhandled intrinsic");
2934 }
2935 }();
2936
2937 SDLoc DL(N);
2938 SDValue TryCancelResponse = N->getOperand(1);
// View the i128 as two i64 lanes and pull out lane 0 and lane 1.
2939 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2940 SDValue TryCancelResponse0 =
2941 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2942 DAG.getIntPtrConstant(0, DL));
2943 SDValue TryCancelResponse1 =
2944 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2945 DAG.getIntPtrConstant(1, DL));
2946
// The replacement node keeps the original node's result types.
2947 return DAG.getNode(Opcode, DL, N->getVTList(),
2948 {TryCancelResponse0, TryCancelResponse1});
2949}
2950
// Lowers the nvvm.f32x4.to.*x4.rs[.relu].satfinite conversion intrinsics to
// the matching NVPTXISD::CVT_*_F32X4_RS_SF node. The new node's operands are
// the four scalar f32 lanes of operand 1, followed by operand 2 (RBits) and an
// i32 constant holding the conversion-mode flags.
// NOTE(review): the signature line and the declaration of `Ops` are not
// visible in this listing.
2952 SDNode *N = Op.getNode();
2953 SDLoc DL(N);
2954 SDValue F32Vec = N->getOperand(1);
2955 SDValue RBits = N->getOperand(2);
2956
2957 unsigned IntrinsicID = N->getConstantOperandVal(0);
2958
2959 // Extract the 4 float elements from the vector
2961 for (unsigned i = 0; i < 4; ++i)
2962 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
2963 DAG.getIntPtrConstant(i, DL)));
2964
2966
// Per intrinsic: target opcode, result type, and mode flags. The relu variants
// share the opcode of the plain variant and add CvtMode::RELU_FLAG; the e2m1x4
// conversions produce i16, every other one v4i8.
2967 auto [OpCode, RetTy, CvtModeFlag] =
2968 [&]() -> std::tuple<unsigned, MVT::SimpleValueType, uint32_t> {
2969 switch (IntrinsicID) {
2970 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
2971 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
2972 CvtMode::RS | CvtMode::RELU_FLAG};
2973 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
2974 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2975 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
2976 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
2977 CvtMode::RS | CvtMode::RELU_FLAG};
2978 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
2979 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2980 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
2981 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
2982 CvtMode::RS | CvtMode::RELU_FLAG};
2983 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
2984 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2985 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
2986 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
2987 CvtMode::RS | CvtMode::RELU_FLAG};
2988 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
2989 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2990 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
2991 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
2992 CvtMode::RS | CvtMode::RELU_FLAG};
2993 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
2994 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
2995 default:
2996 llvm_unreachable("unsupported/unhandled intrinsic");
2997 }
2998 }();
2999
// Trailing operands: the RBits operand, then the mode flags as an i32 constant.
3000 Ops.push_back(RBits);
3001 Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
3002
3003 return DAG.getNode(OpCode, DL, RetTy, Ops);
3004}
3005
// Lowers the nvvm.prmt* intrinsics through getPRMT(). The intrinsic ID
// (operand 0) selects the permute mode, operand 1 is the first source, and the
// last operand is the selector.
// NOTE(review): the signature line and the return statement of every case in
// the mode lambda are not visible in this listing, so the mode constants
// cannot be documented from here.
3007 const unsigned Mode = [&]() {
3008 switch (Op->getConstantOperandVal(0)) {
3009 case Intrinsic::nvvm_prmt:
3011 case Intrinsic::nvvm_prmt_b4e:
3013 case Intrinsic::nvvm_prmt_ecl:
3015 case Intrinsic::nvvm_prmt_ecr:
3017 case Intrinsic::nvvm_prmt_f4e:
3019 case Intrinsic::nvvm_prmt_rc16:
3021 case Intrinsic::nvvm_prmt_rc8:
3023 default:
3024 llvm_unreachable("unsupported/unhandled intrinsic");
3025 }
3026 }();
3027 SDLoc DL(Op);
3028 SDValue A = Op->getOperand(1);
// With four operands the node carries a second source at operand 2; otherwise
// a zero i32 constant stands in for it.
3029 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
3030 : DAG.getConstant(0, DL, MVT::i32);
// The selector is always the final operand, whichever form is used.
3031 SDValue Selector = (Op->op_end() - 1)->get();
3032 return getPRMT(A, B, Selector, DL, DAG, Mode);
3033}
3034
3035#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE) \
3036 Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_x##NUM##_##TYPE
3037
3038#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE) \
3039 NVPTXISD::TCGEN05_LD_RED_##SHAPE##_X##NUM##_##TYPE
3040
3041static unsigned getTcgen05LdRedID(Intrinsic::ID IID) {
3042 switch (IID) {
3043 case TCGEN05_LD_RED_INTR(32x32b, 2, f32):
3044 return TCGEN05_LD_RED_INST(32x32b, 2, F32);
3045 case TCGEN05_LD_RED_INTR(32x32b, 4, f32):
3046 return TCGEN05_LD_RED_INST(32x32b, 4, F32);
3047 case TCGEN05_LD_RED_INTR(32x32b, 8, f32):
3048 return TCGEN05_LD_RED_INST(32x32b, 8, F32);
3049 case TCGEN05_LD_RED_INTR(32x32b, 16, f32):
3050 return TCGEN05_LD_RED_INST(32x32b, 16, F32);
3051 case TCGEN05_LD_RED_INTR(32x32b, 32, f32):
3052 return TCGEN05_LD_RED_INST(32x32b, 32, F32);
3053 case TCGEN05_LD_RED_INTR(32x32b, 64, f32):
3054 return TCGEN05_LD_RED_INST(32x32b, 64, F32);
3055 case TCGEN05_LD_RED_INTR(32x32b, 128, f32):
3056 return TCGEN05_LD_RED_INST(32x32b, 128, F32);
3057 case TCGEN05_LD_RED_INTR(16x32bx2, 2, f32):
3058 return TCGEN05_LD_RED_INST(16x32bx2, 2, F32);
3059 case TCGEN05_LD_RED_INTR(16x32bx2, 4, f32):
3060 return TCGEN05_LD_RED_INST(16x32bx2, 4, F32);
3061 case TCGEN05_LD_RED_INTR(16x32bx2, 8, f32):
3062 return TCGEN05_LD_RED_INST(16x32bx2, 8, F32);
3063 case TCGEN05_LD_RED_INTR(16x32bx2, 16, f32):
3064 return TCGEN05_LD_RED_INST(16x32bx2, 16, F32);
3065 case TCGEN05_LD_RED_INTR(16x32bx2, 32, f32):
3066 return TCGEN05_LD_RED_INST(16x32bx2, 32, F32);
3067 case TCGEN05_LD_RED_INTR(16x32bx2, 64, f32):
3068 return TCGEN05_LD_RED_INST(16x32bx2, 64, F32);
3069 case TCGEN05_LD_RED_INTR(16x32bx2, 128, f32):
3070 return TCGEN05_LD_RED_INST(16x32bx2, 128, F32);
3071 case TCGEN05_LD_RED_INTR(32x32b, 2, i32):
3072 return TCGEN05_LD_RED_INST(32x32b, 2, I32);
3073 case TCGEN05_LD_RED_INTR(32x32b, 4, i32):
3074 return TCGEN05_LD_RED_INST(32x32b, 4, I32);
3075 case TCGEN05_LD_RED_INTR(32x32b, 8, i32):
3076 return TCGEN05_LD_RED_INST(32x32b, 8, I32);
3077 case TCGEN05_LD_RED_INTR(32x32b, 16, i32):
3078 return TCGEN05_LD_RED_INST(32x32b, 16, I32);
3079 case TCGEN05_LD_RED_INTR(32x32b, 32, i32):
3080 return TCGEN05_LD_RED_INST(32x32b, 32, I32);
3081 case TCGEN05_LD_RED_INTR(32x32b, 64, i32):
3082 return TCGEN05_LD_RED_INST(32x32b, 64, I32);
3083 case TCGEN05_LD_RED_INTR(32x32b, 128, i32):
3084 return TCGEN05_LD_RED_INST(32x32b, 128, I32);
3085 case TCGEN05_LD_RED_INTR(16x32bx2, 2, i32):
3086 return TCGEN05_LD_RED_INST(16x32bx2, 2, I32);
3087 case TCGEN05_LD_RED_INTR(16x32bx2, 4, i32):
3088 return TCGEN05_LD_RED_INST(16x32bx2, 4, I32);
3089 case TCGEN05_LD_RED_INTR(16x32bx2, 8, i32):
3090 return TCGEN05_LD_RED_INST(16x32bx2, 8, I32);
3091 case TCGEN05_LD_RED_INTR(16x32bx2, 16, i32):
3092 return TCGEN05_LD_RED_INST(16x32bx2, 16, I32);
3093 case TCGEN05_LD_RED_INTR(16x32bx2, 32, i32):
3094 return TCGEN05_LD_RED_INST(16x32bx2, 32, I32);
3095 case TCGEN05_LD_RED_INTR(16x32bx2, 64, i32):
3096 return TCGEN05_LD_RED_INST(16x32bx2, 64, I32);
3097 case TCGEN05_LD_RED_INTR(16x32bx2, 128, i32):
3098 return TCGEN05_LD_RED_INST(16x32bx2, 128, I32);
3099 default:
3100 llvm_unreachable("Invalid tcgen05.ld.red intrinsic ID");
3101 }
3102}
3103
3104// Lower the vector return type of tcgen05.ld.red intrinsics.
// Rebuilds the node as a target memory intrinsic with scalar results and
// returns {vector of loaded elements, reduction value, chain}. Returns an
// empty optional when the first result is not a vector.
// NOTE(review): the parameter line and the declaration of `MemSD` are not
// visible in this listing.
3105static std::optional<std::tuple<SDValue, SDValue, SDValue>>
3107 SDLoc DL(N);
3108 EVT ResVT = N->getValueType(0);
3109 if (!ResVT.isVector())
3110 return {}; // already legalized.
3111
3112 const unsigned NumElts = ResVT.getVectorNumElements();
3113
3114 // Create the return type of the instructions
3115 // +1 represents the reduction value
// Scalars are f32 for floating-point vectors and i32 for everything else.
3116 SmallVector<EVT, 132> ListVTs{
3117 NumElts + 1,
3118 ResVT.getVectorElementType().isFloatingPoint() ? MVT::f32 : MVT::i32};
3119
3120 ListVTs.push_back(MVT::Other); // Chain
3121
3122 SDVTList ResVTs = DAG.getVTList(ListVTs);
3123
3124 // Prepare the Operands
3125 SmallVector<SDValue, 8> Ops{N->getOperand(0)}; // Chain
3126
3127 // skip IID at index 1
3128 for (unsigned i = 2; i < N->getNumOperands(); i++)
3129 Ops.push_back(N->getOperand(i));
3130
3131 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
// The new node reuses the memory VT and memory operand taken from MemSD.
3133 SDValue NewNode =
3134 DAG.getMemIntrinsicNode(getTcgen05LdRedID(IID), DL, ResVTs, Ops,
3135 MemSD->getMemoryVT(), MemSD->getMemOperand());
3136
3137 // Split vector result
3138 SmallVector<SDValue, 132> ScalarRes;
3139 for (unsigned i = 0; i < NumElts; ++i) {
3140 SDValue Res = NewNode.getValue(i);
3141 ScalarRes.push_back(Res);
3142 }
3143
// Result layout of NewNode: [0, NumElts) loaded elements, NumElts the
// reduction value, NumElts + 1 the chain.
3144 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
3145 SDValue RedResult = NewNode.getValue(NumElts);
3146 SDValue Chain = NewNode.getValue(NumElts + 1);
3147 return {{BuildVector, RedResult, Chain}};
3148}
3149
// Custom lowering for intrinsic nodes whose intrinsic ID is operand 1.
// Only the two-element tcgen05.ld / tcgen05.ld.red forms are handled here; all
// other intrinsics are returned unchanged. When a helper reports that nothing
// needed lowering, an empty SDValue is returned.
// NOTE(review): the signature line is not visible in this listing.
3151 switch (Op->getConstantOperandVal(1)) {
3152 default:
3153 return Op;
3154
3155 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
3156 // lower them through LowerOperation() instead of ReplaceNodeResults().
3157 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
3158 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
3159 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
3160 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
3161 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3162 return SDValue();
3163
// Same as above, but the 16x32bx2 shape is lowered with HasOffset set.
3164 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
3165 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
3166 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3167 return SDValue();
3168
// The reducing loads yield three values: vector, reduction value, chain.
3169 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
3170 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
3171 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32:
3172 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32:
3173 if (auto Res = lowerTcgen05LdRed(Op.getNode(), DAG))
3174 return DAG.getMergeValues(
3175 {std::get<0>(*Res), std::get<1>(*Res), std::get<2>(*Res)}, SDLoc(Op));
3176 return SDValue();
3177 }
3178}
3179
// Custom lowering for intrinsic nodes whose intrinsic ID is operand 0.
// Intrinsics without a case below are returned unchanged.
// NOTE(review): the signature line and the return statement of the
// clusterlaunchcontrol group are not visible in this listing.
3181 switch (Op->getConstantOperandVal(0)) {
3182 default:
3183 return Op;
3184 case Intrinsic::nvvm_prmt:
3185 case Intrinsic::nvvm_prmt_b4e:
3186 case Intrinsic::nvvm_prmt_ecl:
3187 case Intrinsic::nvvm_prmt_ecr:
3188 case Intrinsic::nvvm_prmt_f4e:
3189 case Intrinsic::nvvm_prmt_rc16:
3190 case Intrinsic::nvvm_prmt_rc8:
3191 return lowerPrmtIntrinsic(Op, DAG);
// The wrapper intrinsic is replaced by its operand 1.
3192 case Intrinsic::nvvm_internal_addrspace_wrap:
3193 return Op.getOperand(1);
3194 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
3195 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
3196 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
3197 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
3199 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
3200 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
3201 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
3202 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
3203 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
3204 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
3205 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
3206 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
3207 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
3208 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
3209 return lowerCvtRSIntrinsics(Op, DAG);
3210 }
3211}
3212
3213// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
3214// Lower these into a node returning the correct type which is zero-extended
3215// back to the correct size.
// NOTE(review): the signature line is not visible in this listing.
3217 SDValue V = Op->getOperand(0);
3218 assert(V.getValueType() == MVT::i64 &&
3219 "Unexpected CTLZ/CTPOP type to legalize");
3220
3221 SDLoc DL(Op);
// Same opcode as the original node, but with an i32 result.
3222 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
// Widen back to i64; the extension is tagged with the NonNeg flag.
3223 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
3224}
3225
// Expands a 64-bit funnel shift (Opcode is ISD::FSHL or ISD::FSHR) of A:B by a
// constant amount into two 32-bit funnel shifts over the unpacked halves, and
// packs the two results back into an i64. Returns an empty SDValue when the
// shift amount is not a constant.
// NOTE(review): the first signature line is not visible in this listing.
3227 unsigned Opcode, SelectionDAG &DAG) {
3228 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
3229
3230 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
3231 if (!AmtConst)
3232 return SDValue();
// Only the low 6 bits of the amount are used.
3233 const auto Amt = AmtConst->getZExtValue() & 63;
3234
3235 SDValue UnpackA =
3236 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3237 SDValue UnpackB =
3238 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3239
3240 // Arch is little endian: 0 = low bits, 1 = high bits
3241 SDValue ALo = UnpackA.getValue(0);
3242 SDValue AHi = UnpackA.getValue(1);
3243 SDValue BLo = UnpackB.getValue(0);
3244 SDValue BHi = UnpackB.getValue(1);
3245
3246 // The bitfield consists of { AHi : ALo : BHi : BLo }
3247 //
3248 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3249 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3250 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3251 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3252 //
3253 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3254 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3255 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3256 // move to select and arrange the 32bit values. For simplicity, these cases
3257 // are not handled here explicitly and instead we rely on DAGCombiner to
3258 // remove the no-op funnel shifts we insert.
3259 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3260 ? std::make_tuple(AHi, ALo, BHi)
3261 : std::make_tuple(ALo, BHi, BLo);
3262
// Both 32-bit shifts use the amount reduced modulo 32.
3263 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3264 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3265 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3266
// Pack with the low half first, mirroring the unpack order above.
3267 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3268}
3269
// Funnel-shift lowering: forwards the node's three operands and its own opcode
// to expandFSH64().
// NOTE(review): the signature line is not visible in this listing.
3271 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3272 SDLoc(Op), Op->getOpcode(), DAG);
3273}
3274
// Rotate lowering: a rotate is expressed as a funnel shift whose two data
// inputs are the same value (ROTL -> FSHL, anything else -> FSHR), and is then
// expanded by expandFSH64().
// NOTE(review): the signature line is not visible in this listing.
3276 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3277 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3278 SDLoc(Op), Opcode, DAG);
3279}
3280
// NOTE(review): the signature line and the trailing argument lines of the
// FMUL and FSUB nodes are not visible in this listing.
3282 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3283 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3284 // the semantics of LLVM's frem.
3285 SDLoc DL(Op);
3286 SDValue X = Op->getOperand(0);
3287 SDValue Y = Op->getOperand(1);
3288 EVT Ty = Op.getValueType();
3289 SDNodeFlags Flags = Op->getFlags();
3290
// The division and truncation inherit the flags of the original frem node.
3291 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3292 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3293 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3295 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3297
// With the no-infs flag set, the infinite-Y fix-up below is skipped.
3298 if (Flags.hasNoInfs())
3299 return Sub;
3300
3301 // If Y is infinite, return X
3302 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3303 SDValue Inf =
3304 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3305 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3306 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3307}
3308
// Custom lowering of a select that produces an i1. Either the select is
// performed on the wider pre-truncation values, or it is rewritten as
// (Cond & TrueVal) | (~Cond & FalseVal).
// NOTE(review): the signature line is not visible in this listing.
3310 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3311
3312 SDValue Cond = Op->getOperand(0);
3313 SDValue TrueVal = Op->getOperand(1);
3314 SDValue FalseVal = Op->getOperand(2);
3315 SDLoc DL(Op);
3316
3317 // If both operands are truncated, we push the select through the truncates.
3318 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3319 FalseVal.getOpcode() == ISD::TRUNCATE) {
3320 TrueVal = TrueVal.getOperand(0);
3321 FalseVal = FalseVal.getOperand(0);
3322
// Select in the narrower of the two source types (the true side wins a tie)
// and bring both values to that type first.
3323 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3324 ? TrueVal.getValueType()
3325 : FalseVal.getValueType();
3326 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3327 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3328 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3329 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3330 }
3331
3332 // Otherwise, expand the select into a series of logical operations. These
3333 // often can be folded into other operations either by us or ptxas.
// Both values are frozen before being combined with AND/OR.
3334 TrueVal = DAG.getFreeze(TrueVal);
3335 FalseVal = DAG.getFreeze(FalseVal);
3336 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3337 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3338 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3339 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3340 return Or;
3341}
3342
// Lowers a masked vector store to an NVPTXISD::StoreV4 / StoreV8 memory
// intrinsic. The new node's operands are: chain, one value per vector element
// (a sentinel for masked-off lanes), base pointer, offset.
// Only v4i64/v4f64 and v8i32/v8f32 values are accepted, and the mask must be a
// BUILD_VECTOR of constant i1 elements.
// NOTE(review): the signature line, the declaration of `Ops`, and the heads of
// the two calls that build the per-element operands are not visible in this
// listing.
3344 SDNode *N = Op.getNode();
3345
3346 SDValue Chain = N->getOperand(0);
3347 SDValue Val = N->getOperand(1);
3348 SDValue BasePtr = N->getOperand(2);
3349 SDValue Offset = N->getOperand(3);
3350 SDValue Mask = N->getOperand(4);
3351
3352 SDLoc DL(N);
3353 EVT ValVT = Val.getValueType();
3354 MemSDNode *MemSD = cast<MemSDNode>(N);
3355 assert(ValVT.isVector() && "Masked vector store must have vector type");
3356 assert(MemSD->getAlign() >= DAG.getEVTAlign(ValVT) &&
3357 "Unexpected alignment for masked store");
3358
// Four 64-bit elements use StoreV4; eight 32-bit elements use StoreV8.
3359 unsigned Opcode = 0;
3360 switch (ValVT.getSimpleVT().SimpleTy) {
3361 default:
3362 llvm_unreachable("Unexpected masked vector store type");
3363 case MVT::v4i64:
3364 case MVT::v4f64: {
3365 Opcode = NVPTXISD::StoreV4;
3366 break;
3367 }
3368 case MVT::v8i32:
3369 case MVT::v8f32: {
3370 Opcode = NVPTXISD::StoreV8;
3371 break;
3372 }
3373 }
3374
3376
3377 // Construct the new SDNode. First operand is the chain.
3378 Ops.push_back(Chain);
3379
3380 // The next N operands are the values to store. Encode the mask into the
3381 // values using the sentinel register 0 to represent a masked-off element.
3382 assert(Mask.getValueType().isVector() &&
3383 Mask.getValueType().getVectorElementType() == MVT::i1 &&
3384 "Mask must be a vector of i1");
3385 assert(Mask.getOpcode() == ISD::BUILD_VECTOR &&
3386 "Mask expected to be a BUILD_VECTOR");
3387 assert(Mask.getValueType().getVectorNumElements() ==
3388 ValVT.getVectorNumElements() &&
3389 "Mask size must be the same as the vector size");
3390 for (auto [I, Op] : enumerate(Mask->ops())) {
3391 // Mask elements must be constants.
3392 if (Op.getNode()->getAsZExtVal() == 0) {
3393 // Append a sentinel register 0 to the Ops vector to represent a masked
3394 // off element, this will be handled in tablegen
3396 ValVT.getVectorElementType()));
3397 } else {
3398 // Extract the element from the vector to store
3399 SDValue ExtVal =
3401 Val, DAG.getIntPtrConstant(I, DL));
3402 Ops.push_back(ExtVal);
3403 }
3404 }
3405
3406 // Next, the pointer operand.
3407 Ops.push_back(BasePtr);
3408
3409 // Finally, the offset operand. We expect this to always be undef, and it will
3410 // be ignored in lowering, but to mirror the handling of the other vector
3411 // store instructions we include it in the new SDNode.
3412 assert(Offset.getOpcode() == ISD::UNDEF &&
3413 "Offset operand expected to be undef");
3414 Ops.push_back(Offset);
3415
// The replacement produces only a chain and keeps the original memory VT and
// memory operand.
3416 SDValue NewSt =
3417 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3418 MemSD->getMemoryVT(), MemSD->getMemOperand());
3419
3420 return NewSt;
3421}
3422
// Central dispatch for custom-lowered operations: switches on the opcode of Op
// and forwards to the matching lowering routine. Reaching `default` is treated
// as a programming error.
// NOTE(review): the signature line and a number of `case` labels are not
// visible in this listing; wherever a `return` below follows another `return`
// directly, its label is one of the elided lines.
3423SDValue
3425 switch (Op.getOpcode()) {
// Both of these return an empty SDValue.
3426 case ISD::RETURNADDR:
3427 return SDValue();
3428 case ISD::FRAMEADDR:
3429 return SDValue();
3430 case ISD::ADDRSPACECAST:
3431 return LowerADDRSPACECAST(Op, DAG);
3433 return lowerIntrinsicWChain(Op, DAG);
3435 return lowerIntrinsicWOChain(Op, DAG);
3437 return lowerIntrinsicVoid(Op, DAG);
3438 case ISD::BUILD_VECTOR:
3439 return LowerBUILD_VECTOR(Op, DAG);
3440 case ISD::BITCAST:
3441 return LowerBITCAST(Op, DAG);
3443 return Op;
3445 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3447 return LowerINSERT_VECTOR_ELT(Op, DAG);
3449 return LowerVECTOR_SHUFFLE(Op, DAG);
3451 return LowerCONCAT_VECTORS(Op, DAG);
3456 return LowerVECREDUCE(Op, DAG);
3457 case ISD::STORE:
3458 return LowerSTORE(Op, DAG);
3459 case ISD::MSTORE: {
// Masked stores are only expected when the subtarget reports 256-bit vector
// load/store support for the store's address space.
3460 assert(STI.has256BitVectorLoadStore(
3461 cast<MemSDNode>(Op.getNode())->getAddressSpace()) &&
3462 "Masked store vector not supported on subtarget.");
3463 return lowerMSTORE(Op, DAG);
3464 }
3465 case ISD::LOAD:
3466 return LowerLOAD(Op, DAG);
3467 case ISD::MLOAD:
3468 return LowerMLOAD(Op, DAG);
3469 case ISD::SHL_PARTS:
3470 return LowerShiftLeftParts(Op, DAG);
3471 case ISD::SRA_PARTS:
3472 case ISD::SRL_PARTS:
3473 return LowerShiftRightParts(Op, DAG);
3474 case ISD::SELECT:
3475 return lowerSELECT(Op, DAG);
3476 case ISD::FROUND:
3477 return LowerFROUND(Op, DAG);
3478 case ISD::FCOPYSIGN:
3479 return LowerFCOPYSIGN(Op, DAG);
3480 case ISD::SINT_TO_FP:
3481 case ISD::UINT_TO_FP:
3482 return LowerINT_TO_FP(Op, DAG);
3483 case ISD::FP_TO_SINT:
3484 case ISD::FP_TO_UINT:
3485 // fptosi/fptoui to i1 truncate toward zero, so the only defined results
3486 // are {0,-1} (signed) and {0,1} (unsigned); every other input results in
3487 // poison. Thus we can simply lower to `x <= -1.0` or `x >= 1.0`.
3488 if (Op.getValueType() == MVT::i1) {
3489 SDLoc DL(Op);
3490 SDValue X = Op.getOperand(0);
3491 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
3492 return DAG.getSetCC(
3493 DL, MVT::i1, X,
3494 DAG.getConstantFP(IsSigned ? -1.0 : 1.0, DL, X.getValueType()),
3495 IsSigned ? ISD::SETOLE : ISD::SETOGE);
3496 }
3497 return LowerFP_TO_INT(Op, DAG);
3498 case ISD::FP_ROUND:
3499 return LowerFP_ROUND(Op, DAG);
3500 case ISD::FP_EXTEND:
3501 return LowerFP_EXTEND(Op, DAG);
3502 case ISD::VAARG:
3503 return LowerVAARG(Op, DAG);
3504 case ISD::VASTART:
3505 return LowerVASTART(Op, DAG);
3506 case ISD::FSHL:
3507 case ISD::FSHR:
3508 return lowerFSH(Op, DAG);
3509 case ISD::ROTL:
3510 case ISD::ROTR:
3511 return lowerROT(Op, DAG);
// All of the following integer operations share LowerVectorArith().
3512 case ISD::ABS:
3514 case ISD::SMIN:
3515 case ISD::SMAX:
3516 case ISD::UMIN:
3517 case ISD::UMAX:
3518 case ISD::ADD:
3519 case ISD::SUB:
3520 case ISD::MUL:
3521 case ISD::SHL:
3522 case ISD::SREM:
3523 case ISD::UREM:
3524 return LowerVectorArith(Op, DAG);
3526 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3527 case ISD::STACKRESTORE:
3528 return LowerSTACKRESTORE(Op, DAG);
3529 case ISD::STACKSAVE:
3530 return LowerSTACKSAVE(Op, DAG);
3531 case ISD::CopyToReg:
3532 return LowerCopyToReg_128(Op, DAG);
3533 case ISD::FADD:
3534 case ISD::FSUB:
3535 case ISD::FMUL:
3536 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3537 return PromoteBinOpIfF32FTZ(Op, DAG);
3538 case ISD::CTPOP:
3539 case ISD::CTLZ:
3540 return lowerCTLZCTPOP(Op, DAG);
3541 case ISD::FREM:
3542 return lowerFREM(Op, DAG);
3543 case ISD::BSWAP:
3544 return lowerBSWAP(Op, DAG);
3545 default:
3546 llvm_unreachable("Custom lowering not defined for operation");
3547 }
3548}
3549
3550// This will prevent AsmPrinter from trying to print the jump tables itself.
3554
// Lowers an address-space cast.
//  * If either side is the generic address space, the node is kept as is.
//  * For the shared-memory pair tested below, the cast is split into two
//    casts that go through the generic address space.
//  * Any other cast between two non-generic spaces yields UNDEF.
// NOTE(review): the definition of `N`, two lines of the address-space
// condition, and the initializer of `GenerictVT` are not visible in this
// listing.
3555SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3556 SelectionDAG &DAG) const {
3558 unsigned SrcAS = N->getSrcAddressSpace();
3559 unsigned DestAS = N->getDestAddressSpace();
3560 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3561 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3562 // Shared and SharedCluster can be converted to each other through generic
3563 // space
3564 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3567 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3568 SDLoc DL(Op.getNode());
3569 const MVT GenerictVT =
// First hop: source space -> generic. Second hop: generic -> destination.
3571 SDValue GenericConversion = DAG.getAddrSpaceCast(
3572 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3573 SDValue SharedClusterConversion =
3574 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3575 ADDRESS_SPACE_GENERIC, DestAS);
3576 return SharedClusterConversion;
3577 }
3578
// No conversion is provided between these two non-generic spaces.
3579 return DAG.getUNDEF(Op.getValueType());
3580 }
3581
3582 return Op;
3583}
3584
3585// This function is almost a copy of SelectionDAG::expandVAArg().
3586// The only diff is that this one produces loads from local address space.
// Steps: load the current va_list pointer, round it up when the requested
// alignment exceeds the minimum stack-argument alignment, store the advanced
// pointer back, and finally load the argument from the (aligned) old pointer.
// NOTE(review): the size added to the pointer and the operand of
// Constant::getNullValue() are on lines not visible in this listing.
3587SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3588 const TargetLowering *TLI = STI.getTargetLowering();
3589 SDLoc DL(Op);
3590
3591 SDNode *Node = Op.getNode();
3592 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3593 EVT VT = Node->getValueType(0);
3594 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
// Tmp1 starts out as operand 0 and Tmp2 as operand 1; operand 3 is the
// requested alignment.
3595 SDValue Tmp1 = Node->getOperand(0);
3596 SDValue Tmp2 = Node->getOperand(1);
3597 const MaybeAlign MA(Node->getConstantOperandVal(3));
3598
3599 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3600 Tmp1, Tmp2, MachinePointerInfo(V));
3601 SDValue VAList = VAListLoad;
3602
// Round up: (VAList + align - 1) & -align.
3603 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3604 VAList = DAG.getNode(
3605 ISD::ADD, DL, VAList.getValueType(), VAList,
3606 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3607
3608 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3609 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3610 VAList.getValueType()));
3611 }
3612
3613 // Increment the pointer, VAList, to the next vaarg
3614 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3616 DL, VAList.getValueType()));
3617
3618 // Store the incremented VAList to the legalized pointer
// The store is chained after the va_list load (result 1 of VAListLoad).
3619 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3620 MachinePointerInfo(V));
3621
3622 const Value *SrcV = Constant::getNullValue(
3624
3625 // Load the actual argument out of the pointer VAList
3626 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3627}
3628
3629SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3630 const TargetLowering *TLI = STI.getTargetLowering();
3631 SDLoc DL(Op);
3632 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3633
3634 // Store the address of unsized array <function>_vararg[] in the ap object.
3635 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3636
3637 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3638 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3639 MachinePointerInfo(SV));
3640}
3641
// Turns a masked vector load into an ordinary load plus a 32-bit
// "used bytes" mask. Each enabled mask element contributes
// ElementSizeInBytes set bits; mask element 0 ends up in the least significant
// bits because the mask is walked in reverse while shifting left.
// Returns {new load node, used-bytes mask}; the mask is forced to UINT32_MAX
// when the subtarget lacks the used-bytes-mask pragma.
// NOTE(review): the first signature line is not visible in this listing.
3642static std::pair<MemSDNode *, uint32_t>
3644 const NVPTXSubtarget &STI) {
3645 SDValue Chain = N->getOperand(0);
3646 SDValue BasePtr = N->getOperand(1);
3647 SDValue Mask = N->getOperand(3);
3648 [[maybe_unused]] SDValue Passthru = N->getOperand(4);
3649
3650 SDLoc DL(N);
3651 EVT ResVT = N->getValueType(0);
3652 assert(ResVT.isVector() && "Masked vector load must have vector type");
3653 // While we only expect poison passthru vectors as an input to the backend,
3654 // when the legalization framework splits a poison vector in half, it creates
3655 // two undef vectors, so we can technically expect those too.
3656 assert((Passthru.getOpcode() == ISD::POISON ||
3657 Passthru.getOpcode() == ISD::UNDEF) &&
3658 "Passthru operand expected to be poison or undef");
3659
3660 // Extract the mask and convert it to a uint32_t representing the used bytes
3661 // of the entire vector load
3662 uint32_t UsedBytesMask = 0;
3663 uint32_t ElementSizeInBits = ResVT.getVectorElementType().getSizeInBits();
3664 assert(ElementSizeInBits % 8 == 0 && "Unexpected element size");
3665 uint32_t ElementSizeInBytes = ElementSizeInBits / 8;
// One set bit per byte of a single element.
3666 uint32_t ElementMask = (1u << ElementSizeInBytes) - 1u;
3667
3668 for (SDValue Op : reverse(Mask->ops())) {
3669 // We technically only want to do this shift for every
3670 // iteration *but* the first, but in the first iteration UsedBytesMask is 0,
3671 // so this shift is a no-op.
3672 UsedBytesMask <<= ElementSizeInBytes;
3673
3674 // Mask elements must be constants.
3675 if (Op->getAsZExtVal() != 0)
3676 UsedBytesMask |= ElementMask;
3677 }
3678
3679 assert(UsedBytesMask != 0 && UsedBytesMask != UINT32_MAX &&
3680 "Unexpected masked load with elements masked all on or all off");
3681
3682 // Create a new load sd node to be handled normally by ReplaceLoadVector.
// The new load reuses the chain, base pointer and memory operand of N.
3683 MemSDNode *NewLD = cast<MemSDNode>(
3684 DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());
3685
3686 // If our subtarget does not support the used bytes mask pragma, "drop" the
3687 // mask by setting it to UINT32_MAX
3688 if (!STI.hasUsedBytesMaskPragma())
3689 UsedBytesMask = UINT32_MAX;
3690
3691 return {NewLD, UsedBytesMask};
3692}
3693
3694/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
///
/// Returns std::nullopt without producing a LoadV node when
///  - the result type differs from the memory type (extending load),
///  - getVectorLoweringShape reports no shape for this type/address space,
///  - the load's alignment is below the preferred alignment of the memory
///    type, or
///  - the shape's element count is not 2, 4 or 8.
/// Otherwise returns {value, chain}: the value is the loaded pieces rebuilt
/// into a vector and bitcast to the result type; the chain is the final
/// result of the new NVPTXISD::LoadV2/V4/V8 node.
3695static std::optional<std::pair<SDValue, SDValue>>
3698 const EVT ResVT = LD->getValueType(0);
3699 const EVT MemVT = LD->getMemoryVT();
3700
3701 // If we're doing sign/zero extension as part of the load, avoid lowering to
3702 // a LoadV node. TODO: consider relaxing this restriction.
3703 if (ResVT != MemVT)
3704 return std::nullopt;
3705
3706 const auto NumEltsAndEltVT =
3707 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3708 if (!NumEltsAndEltVT)
3709 return std::nullopt;
3710 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3711
3712 Align Alignment = LD->getAlign();
3713 const auto &TD = DAG.getDataLayout();
3714 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3715 if (Alignment < PrefAlign) {
3716 // This load is not sufficiently aligned, so bail out and let this vector
3717 // load be scalarized. Note that we may still be able to emit smaller
3718 // vector loads. For example, if we are loading a <4 x float> with an
3719 // alignment of 8, this check will fail but the legalizer will try again
3720 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3721 return std::nullopt;
3722 }
3723
3724 // If we have a masked load, convert it to a normal load now
// UsedBytesMask stays empty for an ordinary (unmasked) load.
3725 std::optional<uint32_t> UsedBytesMask = std::nullopt;
3726 if (LD->getOpcode() == ISD::MLOAD)
3727 std::tie(LD, UsedBytesMask) =
3729
3730 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3731 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3732 // loaded type to i16 and propagate the "real" type as the memory type.
3733 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3734
3735 unsigned Opcode;
3736 switch (NumElts) {
3737 default:
3738 return std::nullopt;
3739 case 2:
3740 Opcode = NVPTXISD::LoadV2;
3741 break;
3742 case 4:
3743 Opcode = NVPTXISD::LoadV4;
3744 break;
3745 case 8:
3746 Opcode = NVPTXISD::LoadV8;
3747 break;
3748 }
// Result list of the new node: NumElts values of LoadEltVT followed by the
// chain (MVT::Other).
3749 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3750 ListVTs.push_back(MVT::Other);
3751 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3752
3753 SDLoc DL(LD);
3754
3755 // Copy regular operands
3756 SmallVector<SDValue, 8> OtherOps(LD->ops());
3757
// Used-bytes mask operand. A load that did not come from an MLOAD has no
// mask and passes UINT32_MAX.
3758 OtherOps.push_back(
3759 DAG.getConstant(UsedBytesMask.value_or(UINT32_MAX), DL, MVT::i32));
3760
3761 // The select routine does not have access to the LoadSDNode instance, so
3762 // pass along the extension information
3763 OtherOps.push_back(
3764 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
3765
3766 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3767 LD->getMemOperand());
3768
// Flatten the node's value results into one list of scalars.
3769 SmallVector<SDValue> ScalarRes;
3770 if (EltVT.isVector()) {
3772 assert(NumElts * EltVT.getVectorNumElements() ==
3773 ResVT.getVectorNumElements());
3774 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3775 // into individual elements.
3776 for (const unsigned I : llvm::seq(NumElts)) {
3777 SDValue SubVector = NewLD.getValue(I);
3778 DAG.ExtractVectorElements(SubVector, ScalarRes);
3779 }
3780 } else {
3781 for (const unsigned I : llvm::seq(NumElts)) {
3782 SDValue Res = NewLD.getValue(I);
// Undo the widening to i16 applied above for sub-16-bit elements.
3783 if (LoadEltVT != EltVT)
3784 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3785 ScalarRes.push_back(Res);
3786 }
3787 }
3788
// The chain is the result that follows the NumElts value results.
3789 SDValue LoadChain = NewLD.getValue(NumElts);
3790
// Rebuild a vector of the scalar element type from the collected scalars and
// bitcast it to the type the original load produced.
3791 const MVT BuildVecVT =
3792 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3793 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3794 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3795
3796 return {{LoadValue, LoadChain}};
3797}
3798
3801 const NVPTXSubtarget &STI) {
// Thin wrapper: when replaceLoadVector(N, DAG, STI) succeeds, append its
// {value, chain} pair to Results; when it returns std::nullopt, Results is
// left untouched.
3802 if (auto Res = replaceLoadVector(N, DAG, STI))
3803 Results.append({Res->first, Res->second});
3804}
3805
3807 const NVPTXSubtarget &STI) {
// Thin wrapper: when replaceLoadVector(N, DAG, STI) succeeds, return its
// {value, chain} pair as a single MergeValues node; otherwise return an empty
// SDValue.
3808 if (auto Res = replaceLoadVector(N, DAG, STI))
3809 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3810 return SDValue();
3811}
3812
// Custom lowering of a non-extending i1 load: the bit is read as an i8 that
// is zero-extended to i16, then truncated back to i1.
//
3813// v = ld i1* addr
3814// =>
3815// v1 = ld i8* addr (-> i16)
3816// v = trunc i16 to i1
3818 SDLoc dl(LD);
3819 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3820 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
// The replacement load reuses the original chain, base pointer, pointer info,
// alignment and memory-operand flags.
3821 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3822 LD->getBasePtr(), LD->getPointerInfo(),
3823 MVT::i8, LD->getAlign(),
3824 LD->getMemOperand()->getFlags());
3825 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3826 // The legalizer (the caller) is expecting two values from the legalized
3827 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3828 // in LegalizeDAG.cpp which also uses MergeValues.
// NOTE(review): the chain handed back is the *input* chain of the original
// load (LD->getChain()), not the output chain of newLD (newLD.getValue(1)).
// Confirm that users of the chain result do not need to be ordered after the
// new load.
3829 return DAG.getMergeValues({result, LD->getChain()}, dl);
3830}
3831
3832SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3833 LoadSDNode *LD = cast<LoadSDNode>(Op);
3834
3835 if (Op.getValueType() == MVT::i1)
3836 return lowerLOADi1(LD, DAG);
3837
3838 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3839 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3840 // we allow for more DAG combine opportunities.
3841 if (LD->getExtensionType() == ISD::EXTLOAD) {
3842 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3843 "Unexpected fpext-load");
3844 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3845 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3846 LD->getMemOperand());
3847 }
3848
3849 llvm_unreachable("Unexpected custom lowering for load");
3850}
3851
/// Custom lowering for masked loads. Only result types accepted by
/// NVPTX::isPackedVectorTy are rewritten, into an NVPTXISD::MLoad memory
/// intrinsic node that carries the used-bytes mask and the extension type as
/// two extra trailing operands. For every other type an empty SDValue is
/// returned.
3852SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
3853 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
3854 // masked loads of these types and have to handle them here.
3855 // v2f32 also needs to be handled here if the subtarget has f32x2
3856 // instructions, making it legal.
3857 //
3858 // Note: misaligned masked loads should never reach this point
3859 // because the override of isLegalMaskedLoad in NVPTXTargetTransformInfo.cpp
3860 // will validate alignment. Therefore, we do not need to special case handle
3861 // them here.
3862 EVT VT = Op.getValueType();
3863 if (NVPTX::isPackedVectorTy(VT)) {
3865 cast<MemSDNode>(Op.getNode()), DAG, STI);
// Result is unpacked as (replacement load node, used-bytes mask).
3866 MemSDNode *LD = std::get<0>(Result);
3867 uint32_t UsedBytesMask = std::get<1>(Result);
3868
3869 SDLoc DL(LD);
3870
3871 // Copy regular operands
3872 SmallVector<SDValue, 8> OtherOps(LD->ops());
3873
3874 OtherOps.push_back(DAG.getConstant(UsedBytesMask, DL, MVT::i32));
3875
3876 // We currently are not lowering extending loads, but pass the extension
3877 // type anyway as later handling expects it.
// NOTE(review): the cast below requires LD to be a LoadSDNode, i.e. the node
// unpacked from Result must be a plain load.
3878 OtherOps.push_back(
3879 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
3880 SDValue NewLD =
3881 DAG.getMemIntrinsicNode(NVPTXISD::MLoad, DL, LD->getVTList(), OtherOps,
3882 LD->getMemoryVT(), LD->getMemOperand());
3883 return NewLD;
3884 }
3885 return SDValue();
3886}
3887
3889 const NVPTXSubtarget &STI) {
// Lowers a vector store to an NVPTXISD::StoreV2/V4/V8 memory intrinsic node.
// An empty SDValue is returned when
//  - the stored value's type differs from the memory type (truncating store),
//  - getVectorLoweringShape reports no shape for this type/address space,
//  - the store's alignment is below the preferred alignment of the value
//    type, or
//  - the shape's element count is not 2, 4 or 8.
// Operand order of the new node: chain, the NumElts split values, then every
// operand of the original node from index 2 onwards.
3890 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3891 SDValue Val = N->getOperand(1);
3892 SDLoc DL(N);
3893 const EVT ValVT = Val.getValueType();
3894 const EVT MemVT = N->getMemoryVT();
3895
3896 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3897 // TODO: consider relaxing this restriction.
3898 if (ValVT != MemVT)
3899 return SDValue();
3900
3901 const auto NumEltsAndEltVT =
3902 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3903 if (!NumEltsAndEltVT)
3904 return SDValue();
3905 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3906
3907 const DataLayout &TD = DAG.getDataLayout();
3908
3909 Align Alignment = N->getAlign();
3910 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3911 if (Alignment < PrefAlign) {
3912 // This store is not sufficiently aligned, so bail out and let this vector
3913 // store be scalarized. Note that we may still be able to emit smaller
3914 // vector stores. For example, if we are storing a <4 x float> with an
3915 // alignment of 8, this check will fail but the legalizer will try again
3916 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3917 return SDValue();
3918 }
3919
3920 unsigned Opcode;
3921 switch (NumElts) {
3922 default:
3923 return SDValue();
3924 case 2:
3925 Opcode = NVPTXISD::StoreV2;
3926 break;
3927 case 4:
3928 Opcode = NVPTXISD::StoreV4;
3929 break;
3930 case 8:
3931 Opcode = NVPTXISD::StoreV8;
3932 break;
3933 }
3934
3936
3937 // First is the chain
3938 Ops.push_back(N->getOperand(0));
3939
3940 // Then the split values
3941 if (EltVT.isVector()) {
3943 assert(NumElts * EltVT.getVectorNumElements() ==
3944 ValVT.getVectorNumElements());
3945 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3946 // stored as b32s
3947 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3948 for (const unsigned I : llvm::seq(NumElts)) {
3949 SmallVector<SDValue, 4> SubVectorElts;
3950 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3951 NumEltsPerSubVector);
3952 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3953 }
3954 } else {
// Scalar pieces: view the value as NumElts x EltVT and extract each lane.
3955 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3956 for (const unsigned I : llvm::seq(NumElts)) {
3957 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3958 DAG.getIntPtrConstant(I, DL));
3959
3960 // Since StoreV2 is a target node, we cannot rely on DAG type
3961 // legalization. Therefore, we must ensure the type is legal. For i1 and
3962 // i8, we set the stored type to i16 and propagate the "real" type as the
3963 // memory type.
3964 if (EltVT.getSizeInBits() < 16)
3965 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3966 Ops.push_back(ExtVal);
3967 }
3968 }
3969
3970 // Then any remaining arguments
3971 Ops.append(N->op_begin() + 2, N->op_end());
3972
3973 SDValue NewSt =
3974 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3975 N->getMemoryVT(), N->getMemOperand());
3976
3977 // return DCI.CombineTo(N, NewSt, true);
3978 return NewSt;
3979}
3980
3981SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3982 StoreSDNode *Store = cast<StoreSDNode>(Op);
3983 EVT VT = Store->getMemoryVT();
3984
3985 if (VT == MVT::i1)
3986 return LowerSTOREi1(Op, DAG);
3987
3988 // Lower store of any other vector type, including v2f32 as we want to break
3989 // it apart since this is not a widely-supported type.
3990 return lowerSTOREVector(Op, DAG, STI);
3991}
3992
3993// st i1 v, addr
3994// =>
3995// v1 = zxt v to i16
3996// st.u8 i16, addr
3997SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3998 SDNode *Node = Op.getNode();
3999 SDLoc dl(Node);
4000 StoreSDNode *ST = cast<StoreSDNode>(Node);
4001 SDValue Tmp1 = ST->getChain();
4002 SDValue Tmp2 = ST->getBasePtr();
4003 SDValue Tmp3 = ST->getValue();
4004 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
4005 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
4006 SDValue Result =
4007 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
4008 ST->getAlign(), ST->getMemOperand()->getFlags());
4009 return Result;
4010}
4011
/// Rewrites a CopyToReg node with a 128-bit value into a CopyToReg taking two
/// i64 values. The value (operand 2) is bitcast to v2i64; element 0 becomes
/// the "Lo" operand and element 1 the "Hi" operand. The new node keeps the
/// original result types and has operands {chain, dst reg, Lo, Hi[, glue]}.
4012SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
4013 SelectionDAG &DAG) const {
4014 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
4015 // operand so that it can pass the legalization.
4016
// Note that the check is on operand 1 (the destination register operand),
// while the value that gets split below is operand 2.
4017 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
4018 "Custom lowering for 128-bit CopyToReg only");
4019
4020 SDNode *Node = Op.getNode();
4021 SDLoc DL(Node);
4022
4023 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
4024 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
4025 DAG.getIntPtrConstant(0, DL));
4026 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
4027 DAG.getIntPtrConstant(1, DL));
4028
4030 SmallVector<EVT, 3> ResultsType(Node->values());
4031
4032 NewOps[0] = Op->getOperand(0); // Chain
4033 NewOps[1] = Op->getOperand(1); // Dst Reg
4034 NewOps[2] = Lo; // Lower 64-bit
4035 NewOps[3] = Hi; // Higher 64-bit
// A fourth operand on the original node is the optional glue; it moves from
// index 3 to index 4 because the value now occupies two slots.
4036 if (Op.getNumOperands() == 4)
4037 NewOps[4] = Op->getOperand(3); // Glue if exists
4038
4039 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
4040}
4041
4042unsigned NVPTXTargetLowering::getNumRegisters(
4043 LLVMContext &Context, EVT VT,
4044 std::optional<MVT> RegisterVT = std::nullopt) const {
4045 if (VT == MVT::i128 && RegisterVT == MVT::i128)
4046 return 1;
4047 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
4048}
4049
4050bool NVPTXTargetLowering::splitValueIntoRegisterParts(
4051 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4052 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4053 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
4054 Parts[0] = Val;
4055 return true;
4056 }
4057 return false;
4058}
4059
4060// This creates target external symbol for a function parameter.
4061// Name of the symbol is composed from its index and the function name.
4062// Negative index corresponds to special parameter (unsized array) used for
4063// passing variable arguments.
//
// The name is interned through nvTM->getStrPool().save() and the resulting
// character pointer is what the external symbol node of type T is built from
// (presumably so the string outlives this call -- confirm against the pool's
// ownership rules).
4064SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
4065 EVT T) const {
4066 StringRef SavedStr = nvTM->getStrPool().save(
4068 return DAG.getExternalSymbol(SavedStr.data(), T);
4069}
4070
4071SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
4072 EVT T) const {
4073 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
4074 return DAG.getExternalSymbol(SavedStr.data(), T);
4075}
4076
4078 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4079 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4080 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Produces one SDValue in InVals for every entry of Ins, walking the IR
// arguments of the current function in order:
//  - an argument without uses yields UNDEF for each of its Ins entries;
//  - a byval argument yields its parameter symbol (kernels) or a MoveParam of
//    that symbol followed by an address-space cast (other functions);
//  - any other argument is loaded from its parameter symbol, in the vectorized
//    pieces chosen by VectorizePTXValueVTs, and each element is then adjusted
//    with correctParamType.
// The incoming Chain is returned unchanged.
4081 const DataLayout &DL = DAG.getDataLayout();
4082 LLVMContext &Ctx = *DAG.getContext();
4083 auto PtrVT = getPointerTy(DAG.getDataLayout());
4084
4085 const Function &F = DAG.getMachineFunction().getFunction();
4086 const bool IsKernel = isKernelFunction(F);
4087
4088 SDValue Root = DAG.getRoot();
4089 SmallVector<SDValue, 16> OutChains;
4090
4091 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
4092 // Ins.size() will be larger
4093 // * if there is an aggregate argument with multiple fields (each field
4094 // showing up separately in Ins)
4095 // * if there is a vector argument with more than typical vector-length
4096 // elements (generally if more than 4) where each vector element is
4097 // individually present in Ins.
4098 // So a different index should be used for indexing into Ins.
4099 // See similar issue in LowerCall.
4100
// AllIns is consumed front to back: each IR argument takes the leading run of
// entries whose OrigArgIndex equals its argument number.
4101 auto AllIns = ArrayRef(Ins);
4102 for (const auto &Arg : F.args()) {
4103 const auto ArgIns = AllIns.take_while(
4104 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
4105 AllIns = AllIns.drop_front(ArgIns.size());
4106
4107 Type *Ty = Arg.getType();
4108
4109 if (ArgIns.empty())
4110 report_fatal_error("Empty parameter types are not supported");
4111
4112 if (Arg.use_empty()) {
4113 // argument is dead
4114 for (const auto &In : ArgIns) {
4115 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
4116 InVals.push_back(DAG.getUNDEF(In.VT));
4117 }
4118 continue;
4119 }
4120
4121 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
4122
4123 // In the following cases, assign a node order of "i+1"
4124 // to newly created nodes. The SDNodes for params have to
4125 // appear in the same order as their order of appearance
4126 // in the original function. "i+1" holds that order.
4127 if (Arg.hasByValAttr()) {
4128 // Param has ByVal attribute
4129 // Return MoveParam(param symbol).
4130 // Ideally, the param symbol can be returned directly,
4131 // but when SDNode builder decides to use it in a CopyToReg(),
4132 // machine instruction fails because TargetExternalSymbol
4133 // (not lowered) is target dependent, and CopyToReg assumes
4134 // the source is lowered.
4135 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
4136 const auto &ByvalIn = ArgIns[0];
4137 assert(getValueType(DL, Ty) == ByvalIn.VT &&
4138 "Ins type did not match function type");
4139 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
4140
4141 SDValue P;
4142 if (IsKernel) {
4143 assert(isParamGridConstant(Arg) && "ByVal argument must be lowered to "
4144 "grid_constant by NVPTXLowerArgs");
4145 P = ArgSymbol;
4146 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4147 } else {
4148 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
4149 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4150 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
4152 }
4153 InVals.push_back(P);
4154 } else {
4157 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
4158 assert(VTs.size() == ArgIns.size() && "Size mismatch");
4159 assert(VTs.size() == Offsets.size() && "Size mismatch");
4160
4161 const Align ArgAlign = getFunctionArgumentAlignment(
4162 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
4163
// I indexes the first value of the current piece within VTs/Offsets/ArgIns;
// it advances by the piece's element count after every piece.
4164 unsigned I = 0;
4165 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
4166 for (const unsigned NumElts : VI) {
4167 // i1 is loaded/stored as i8
4168 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
4169 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
4170
4171 SDValue VecAddr = DAG.getObjectPtrOffset(
4172 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
4173
4174 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
4175 const unsigned AS = IsKernel ? NVPTX::AddressSpace::EntryParam
4177 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
4178 MachinePointerInfo(AS), PartAlign,
4181 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4182 for (const unsigned J : llvm::seq(NumElts)) {
4183 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
4184
4185 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
4186 DAG, dl);
4187 InVals.push_back(Elt);
4188 }
4189 I += NumElts;
4190 }
4191 }
4192 }
4193
// NOTE(review): no statement visible in this function appends to OutChains;
// if that holds for the full source, this branch never fires and OutChains
// could be removed -- confirm.
4194 if (!OutChains.empty())
4195 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
4196
4197 return Chain;
4198}
4199
// Lowers a function return. A void return becomes a bare NVPTXISD::RET_GLUE
// on the incoming chain. Otherwise the outgoing values are stored, in the
// vectorized pieces chosen by VectorizePTXValueVTs, at their offsets from the
// "func_retval0" external symbol, and the RET_GLUE node is chained after the
// last store.
4200SDValue
4202 bool isVarArg,
4204 const SmallVectorImpl<SDValue> &OutVals,
4205 const SDLoc &dl, SelectionDAG &DAG) const {
4206 const Function &F = DAG.getMachineFunction().getFunction();
4207 Type *RetTy = F.getReturnType();
4208
4209 if (RetTy->isVoidTy()) {
// NOTE(review): the assert message reads inverted -- the condition checks
// that there are NO outgoing values for a void return.
4210 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
4211 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4212 }
4213
4214 const DataLayout &DL = DAG.getDataLayout();
4215 LLVMContext &Ctx = *DAG.getContext();
4216
4217 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
4218 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
4219
4220 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
4221 // 32-bits are sign extended or zero extended, depending on whether
4222 // they are signed or unsigned types.
4223 const bool ExtendIntegerRetVal =
4224 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
4225
4228 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
4229 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
4230
// Returns outgoing value I converted to the type it is stored as: i32 when
// the whole return value is a short integer that must be extended, otherwise
// the promoted scalar type with i1 widened to i8.
4231 const auto GetRetVal = [&](unsigned I) -> SDValue {
4232 SDValue RetVal = OutVals[I];
4234 RetVal.getValueType() &&
4235 "OutVal type should always be legal");
4236
4237 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
4238 const EVT StoreVT =
4239 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
4240 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
4241 };
4242
// I indexes the first value of the current piece; it advances by the piece's
// element count after every store.
4243 unsigned I = 0;
4244 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
4245 for (const unsigned NumElts : VI) {
// No explicit alignment is passed for the extended-integer case; otherwise
// the alignment is what RetAlign guarantees at this offset.
4246 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
4247 ? MaybeAlign(std::nullopt)
4248 : commonAlignment(RetAlign, Offsets[I]);
4249
4251 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
4252
4253 SDValue Ptr =
4254 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
4255
4256 Chain = DAG.getStore(Chain, dl, Val, Ptr,
4258 CurrentAlign);
4259
4260 I += NumElts;
4261 }
4262
4263 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4264}
4265
4267 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
4268 SelectionDAG &DAG) const {
// Constraints longer than one character are ignored: the function returns
// without touching Ops.
4269 if (Constraint.size() > 1)
4270 return;
4272}
4273
4274// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4275// TgtMemIntrinsic
4276// because we need the information that is only available in the "Value" type
4277// of destination
4278// pointer. In particular, the address space information.
4281 MachineFunction &MF, unsigned Intrinsic) const {
4282 IntrinsicInfo Info;
4283 switch (Intrinsic) {
4284 default:
4285 return;
4286 case Intrinsic::nvvm_match_all_sync_i32p:
4287 case Intrinsic::nvvm_match_all_sync_i64p:
4288 Info.opc = ISD::INTRINSIC_W_CHAIN;
4289 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4290 // in order to model data exchange with other threads, but perform no real
4291 // memory accesses.
4292 Info.memVT = MVT::i1;
4293
4294 // Our result depends on both our and other thread's arguments.
4296 Infos.push_back(Info);
4297 return;
4298 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4299 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4300 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4301 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4302 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4303 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4304 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4305 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4306 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4307 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4308 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4309 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4310 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4311 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4312 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4313 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4314 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4315 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4316 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4317 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4318 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4319 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4320 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4321 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4322 Info.opc = ISD::INTRINSIC_W_CHAIN;
4323 Info.memVT = MVT::v8f16;
4324 Info.ptrVal = I.getArgOperand(0);
4325 Info.offset = 0;
4326 Info.flags = MachineMemOperand::MOLoad;
4327 Info.align = Align(16);
4328 Infos.push_back(Info);
4329 return;
4330 }
4331 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4332 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4333 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4334 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4335 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4336 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4337 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4338 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4339 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4340 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4341 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4342 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4343 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4344 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4345 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4346 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4347 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4348 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4349 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4350 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4351 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4352 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4353 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4354 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4355 Info.opc = ISD::INTRINSIC_W_CHAIN;
4356 Info.memVT = MVT::v2i32;
4357 Info.ptrVal = I.getArgOperand(0);
4358 Info.offset = 0;
4359 Info.flags = MachineMemOperand::MOLoad;
4360 Info.align = Align(8);
4361 Infos.push_back(Info);
4362 return;
4363 }
4364
4365 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4366 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4367 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4368 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4369 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4370 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4371 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4372 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4373 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4374 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4375 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4376 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4377 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4378 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4379 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4380 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4381
4382 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4383 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4384 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4385 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4386 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4387 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4388 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4389 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4390 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4391 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4392 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4393 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4394 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4395 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4396 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4397 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4398 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4399 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
4400 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
4401 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
4402 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
4403 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
4404 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
4405 Info.opc = ISD::INTRINSIC_W_CHAIN;
4406 Info.memVT = MVT::v4i32;
4407 Info.ptrVal = I.getArgOperand(0);
4408 Info.offset = 0;
4409 Info.flags = MachineMemOperand::MOLoad;
4410 Info.align = Align(16);
4411 Infos.push_back(Info);
4412 return;
4413 }
4414
4415 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4416 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4417 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4418 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4419 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4420 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4421 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4422 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4423
4424 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4425 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4426 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4427 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4428 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4429 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4430 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4431 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4432 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4433 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4434 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4435 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4436 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4437 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4438 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4439 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4440 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4441 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4442 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4443 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4444 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4445 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4446 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4447 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4448 Info.opc = ISD::INTRINSIC_W_CHAIN;
4449 Info.memVT = MVT::i32;
4450 Info.ptrVal = I.getArgOperand(0);
4451 Info.offset = 0;
4452 Info.flags = MachineMemOperand::MOLoad;
4453 Info.align = Align(4);
4454 Infos.push_back(Info);
4455 return;
4456 }
4457
4458 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4459 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4460 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4461 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4462 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4463 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4464 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4465 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4466 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4467 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4468 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4469 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4470 Info.opc = ISD::INTRINSIC_W_CHAIN;
4471 Info.memVT = MVT::v4f16;
4472 Info.ptrVal = I.getArgOperand(0);
4473 Info.offset = 0;
4474 Info.flags = MachineMemOperand::MOLoad;
4475 Info.align = Align(16);
4476 Infos.push_back(Info);
4477 return;
4478 }
4479
4480 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4481 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4482 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4483 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4484 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4485 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4486 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4487 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4488 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4489 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4490 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4491 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4492 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4493 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4494 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4495 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4496 Info.opc = ISD::INTRINSIC_W_CHAIN;
4497 Info.memVT = MVT::v8f32;
4498 Info.ptrVal = I.getArgOperand(0);
4499 Info.offset = 0;
4500 Info.flags = MachineMemOperand::MOLoad;
4501 Info.align = Align(16);
4502 Infos.push_back(Info);
4503 return;
4504 }
4505
4506 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4507 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4508 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4509 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4510
4511 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4512 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4513 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4514 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4515
4516 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4517 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4518 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4519 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4520 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4521 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4522 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4523 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4524 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4525 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4526 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4527 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4528 Info.opc = ISD::INTRINSIC_W_CHAIN;
4529 Info.memVT = MVT::v8i32;
4530 Info.ptrVal = I.getArgOperand(0);
4531 Info.offset = 0;
4532 Info.flags = MachineMemOperand::MOLoad;
4533 Info.align = Align(16);
4534 Infos.push_back(Info);
4535 return;
4536 }
4537
4538 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4539 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4540 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4541 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4542 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4543 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4544 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4545 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4546 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4547 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4548 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4549 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4550 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4551 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4552 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4553 Info.opc = ISD::INTRINSIC_W_CHAIN;
4554 Info.memVT = MVT::v2i32;
4555 Info.ptrVal = I.getArgOperand(0);
4556 Info.offset = 0;
4557 Info.flags = MachineMemOperand::MOLoad;
4558 Info.align = Align(8);
4559 Infos.push_back(Info);
4560 return;
4561 }
4562
4563 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4564 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4565 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4566 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4567
4568 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4569 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4570 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4571 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4572 Info.opc = ISD::INTRINSIC_W_CHAIN;
4573 Info.memVT = MVT::f64;
4574 Info.ptrVal = I.getArgOperand(0);
4575 Info.offset = 0;
4576 Info.flags = MachineMemOperand::MOLoad;
4577 Info.align = Align(8);
4578 Infos.push_back(Info);
4579 return;
4580 }
4581
4582 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4583 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4584 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4585 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4586 Info.opc = ISD::INTRINSIC_W_CHAIN;
4587 Info.memVT = MVT::v2f64;
4588 Info.ptrVal = I.getArgOperand(0);
4589 Info.offset = 0;
4590 Info.flags = MachineMemOperand::MOLoad;
4591 Info.align = Align(16);
4592 Infos.push_back(Info);
4593 return;
4594 }
4595
4596 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4597 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4598 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4599 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4600 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4601 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4602 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4603 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4604 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4605 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4606 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4607 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4608 Info.opc = ISD::INTRINSIC_VOID;
4609 Info.memVT = MVT::v4f16;
4610 Info.ptrVal = I.getArgOperand(0);
4611 Info.offset = 0;
4612 Info.flags = MachineMemOperand::MOStore;
4613 Info.align = Align(16);
4614 Infos.push_back(Info);
4615 return;
4616 }
4617
4618 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4619 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4620 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4621 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4622 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4623 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4624 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4625 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4626 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4627 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4628 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4629 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4630 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4631 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4632 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4633 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4634 Info.opc = ISD::INTRINSIC_VOID;
4635 Info.memVT = MVT::v8f32;
4636 Info.ptrVal = I.getArgOperand(0);
4637 Info.offset = 0;
4638 Info.flags = MachineMemOperand::MOStore;
4639 Info.align = Align(16);
4640 Infos.push_back(Info);
4641 return;
4642 }
4643
4644 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4645 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4646 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4647 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4648 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4649 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4650 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4651 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4652 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4653 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4654 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4655 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4656 Info.opc = ISD::INTRINSIC_VOID;
4657 Info.memVT = MVT::v8i32;
4658 Info.ptrVal = I.getArgOperand(0);
4659 Info.offset = 0;
4660 Info.flags = MachineMemOperand::MOStore;
4661 Info.align = Align(16);
4662 Infos.push_back(Info);
4663 return;
4664 }
4665
4666 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4667 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4668 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4669 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4670 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4671 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4672 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4673 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4674 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4675 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4676 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4677 Info.opc = ISD::INTRINSIC_VOID;
4678 Info.memVT = MVT::v2i32;
4679 Info.ptrVal = I.getArgOperand(0);
4680 Info.offset = 0;
4681 Info.flags = MachineMemOperand::MOStore;
4682 Info.align = Align(8);
4683 Infos.push_back(Info);
4684 return;
4685 }
4686
4687 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4688 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4689 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4690 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4691 Info.opc = ISD::INTRINSIC_VOID;
4692 Info.memVT = MVT::v2f64;
4693 Info.ptrVal = I.getArgOperand(0);
4694 Info.offset = 0;
4695 Info.flags = MachineMemOperand::MOStore;
4696 Info.align = Align(16);
4697 Infos.push_back(Info);
4698 return;
4699 }
4700
4701 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4702 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4703 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4704 Info.opc = ISD::INTRINSIC_VOID;
4705 Info.memVT = MVT::i32;
4706 Info.ptrVal = I.getArgOperand(0);
4707 Info.offset = 0;
4708 Info.flags = MachineMemOperand::MOStore;
4709 Info.align = Align(4);
4710 Infos.push_back(Info);
4711 return;
4712 }
4713
4714 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4715 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4716 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4717 Info.opc = ISD::INTRINSIC_VOID;
4718 Info.memVT = MVT::v4i32;
4719 Info.ptrVal = I.getArgOperand(0);
4720 Info.offset = 0;
4721 Info.flags = MachineMemOperand::MOStore;
4722 Info.align = Align(16);
4723 Infos.push_back(Info);
4724 return;
4725 }
4726
4727 case Intrinsic::nvvm_prefetch_tensormap: {
4728 auto &DL = I.getDataLayout();
4729 Info.opc = ISD::INTRINSIC_VOID;
4730 Info.memVT = getPointerTy(DL);
4731 Info.ptrVal = I.getArgOperand(0);
4732 Info.offset = 0;
4733 Info.flags =
4735 Info.align.reset();
4736 Infos.push_back(Info);
4737 return;
4738 }
4739
4740 case Intrinsic::nvvm_tensormap_replace_global_address:
4741 case Intrinsic::nvvm_tensormap_replace_global_stride: {
4742 Info.opc = ISD::INTRINSIC_VOID;
4743 Info.memVT = MVT::i64;
4744 Info.ptrVal = I.getArgOperand(0);
4745 Info.offset = 0;
4746 Info.flags = MachineMemOperand::MOStore;
4747 Info.align.reset();
4748 Infos.push_back(Info);
4749 return;
4750 }
4751
4752 case Intrinsic::nvvm_tensormap_replace_rank:
4753 case Intrinsic::nvvm_tensormap_replace_box_dim:
4754 case Intrinsic::nvvm_tensormap_replace_global_dim:
4755 case Intrinsic::nvvm_tensormap_replace_element_stride:
4756 case Intrinsic::nvvm_tensormap_replace_elemtype:
4757 case Intrinsic::nvvm_tensormap_replace_interleave_layout:
4758 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
4759 case Intrinsic::nvvm_tensormap_replace_swizzle_atomicity:
4760 case Intrinsic::nvvm_tensormap_replace_fill_mode: {
4761 Info.opc = ISD::INTRINSIC_VOID;
4762 Info.memVT = MVT::i32;
4763 Info.ptrVal = I.getArgOperand(0);
4764 Info.offset = 0;
4765 Info.flags = MachineMemOperand::MOStore;
4766 Info.align.reset();
4767 Infos.push_back(Info);
4768 return;
4769 }
4770
4771 case Intrinsic::nvvm_ldu_global_i:
4772 case Intrinsic::nvvm_ldu_global_f:
4773 case Intrinsic::nvvm_ldu_global_p: {
4774 Info.opc = ISD::INTRINSIC_W_CHAIN;
4775 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4776 Info.ptrVal = I.getArgOperand(0);
4777 Info.offset = 0;
4778 Info.flags = MachineMemOperand::MOLoad;
4779 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4780
4781 Infos.push_back(Info);
4782 return;
4783 }
4784 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4785 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4786 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4787 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4788 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4789 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4790 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4791 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4792 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4793 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4794 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4795 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4796 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4797 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4798 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4799 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4800 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4801 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4802 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4803 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4804 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4805 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4806 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4807 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4808 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4809 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4810 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4811 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4812 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4813 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4814 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4815 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4816 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4817 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4818 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4819 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4820 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4821 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4822 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4823 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4824 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4825 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4826 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4827 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4828 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4829 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4830 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4831 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4832 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4833 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4834 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4835 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4836 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4837 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4838 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4839 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4840 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4841 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4842 Info.opc = ISD::INTRINSIC_W_CHAIN;
4843 Info.memVT = MVT::v4f32;
4844 Info.ptrVal = nullptr;
4845 Info.offset = 0;
4846 Info.flags = MachineMemOperand::MOLoad;
4847 Info.align = Align(16);
4848 Infos.push_back(Info);
4849 return;
4850
4851 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4852 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4853 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4854 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4855 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4856 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4857 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4858 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4859 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4860 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4861 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4862 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4863 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4864 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4865 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4866 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4867 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4868 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4869 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4870 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4871 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4872 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4873 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4874 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4875 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4876 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4877 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4878 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4879 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4880 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4881 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4882 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4883 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4884 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4885 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4886 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4887 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4888 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4889 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4890 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4891 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4892 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4893 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4894 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4895 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4896 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4897 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4898 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4899 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4900 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4901 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4902 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4903 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4904 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4905 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4906 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4907 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4908 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4909 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4910 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4911 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4912 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4913 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4914 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4915 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4916 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4917 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4918 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4919 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4920 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4921 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4922 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4923 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4924 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4925 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4926 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4927 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4928 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4929 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4930 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4931 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4932 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4933 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4934 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4935 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4936 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4937 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4938 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4939 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4940 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4941 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4942 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4943 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4944 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4945 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4946 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4947 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4948 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4949 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4950 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4951 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4952 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4953 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4954 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4955 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4956 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4957 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4958 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4959 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4960 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4961 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4962 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4963 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4964 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4965 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4966 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4967 Info.opc = ISD::INTRINSIC_W_CHAIN;
4968 Info.memVT = MVT::v4i32;
4969 Info.ptrVal = nullptr;
4970 Info.offset = 0;
4971 Info.flags = MachineMemOperand::MOLoad;
4972 Info.align = Align(16);
4973 Infos.push_back(Info);
4974 return;
4975
4976 case Intrinsic::nvvm_suld_1d_i8_clamp:
4977 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4978 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4979 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4980 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4981 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4982 case Intrinsic::nvvm_suld_2d_i8_clamp:
4983 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4984 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4985 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4986 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4987 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4988 case Intrinsic::nvvm_suld_3d_i8_clamp:
4989 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4990 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4991 case Intrinsic::nvvm_suld_1d_i8_trap:
4992 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4993 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4994 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4995 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4996 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4997 case Intrinsic::nvvm_suld_2d_i8_trap:
4998 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4999 case Intrinsic::nvvm_suld_2d_v4i8_trap:
5000 case Intrinsic::nvvm_suld_2d_array_i8_trap:
5001 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
5002 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
5003 case Intrinsic::nvvm_suld_3d_i8_trap:
5004 case Intrinsic::nvvm_suld_3d_v2i8_trap:
5005 case Intrinsic::nvvm_suld_3d_v4i8_trap:
5006 case Intrinsic::nvvm_suld_1d_i8_zero:
5007 case Intrinsic::nvvm_suld_1d_v2i8_zero:
5008 case Intrinsic::nvvm_suld_1d_v4i8_zero:
5009 case Intrinsic::nvvm_suld_1d_array_i8_zero:
5010 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
5011 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
5012 case Intrinsic::nvvm_suld_2d_i8_zero:
5013 case Intrinsic::nvvm_suld_2d_v2i8_zero:
5014 case Intrinsic::nvvm_suld_2d_v4i8_zero:
5015 case Intrinsic::nvvm_suld_2d_array_i8_zero:
5016 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
5017 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
5018 case Intrinsic::nvvm_suld_3d_i8_zero:
5019 case Intrinsic::nvvm_suld_3d_v2i8_zero:
5020 case Intrinsic::nvvm_suld_3d_v4i8_zero:
5021 Info.opc = ISD::INTRINSIC_W_CHAIN;
5022 Info.memVT = MVT::i8;
5023 Info.ptrVal = nullptr;
5024 Info.offset = 0;
5025 Info.flags = MachineMemOperand::MOLoad;
5026 Info.align = Align(16);
5027 Infos.push_back(Info);
5028 return;
5029
5030 case Intrinsic::nvvm_suld_1d_i16_clamp:
5031 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
5032 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
5033 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
5034 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
5035 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
5036 case Intrinsic::nvvm_suld_2d_i16_clamp:
5037 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
5038 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
5039 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
5040 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
5041 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
5042 case Intrinsic::nvvm_suld_3d_i16_clamp:
5043 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
5044 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
5045 case Intrinsic::nvvm_suld_1d_i16_trap:
5046 case Intrinsic::nvvm_suld_1d_v2i16_trap:
5047 case Intrinsic::nvvm_suld_1d_v4i16_trap:
5048 case Intrinsic::nvvm_suld_1d_array_i16_trap:
5049 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
5050 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
5051 case Intrinsic::nvvm_suld_2d_i16_trap:
5052 case Intrinsic::nvvm_suld_2d_v2i16_trap:
5053 case Intrinsic::nvvm_suld_2d_v4i16_trap:
5054 case Intrinsic::nvvm_suld_2d_array_i16_trap:
5055 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
5056 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
5057 case Intrinsic::nvvm_suld_3d_i16_trap:
5058 case Intrinsic::nvvm_suld_3d_v2i16_trap:
5059 case Intrinsic::nvvm_suld_3d_v4i16_trap:
5060 case Intrinsic::nvvm_suld_1d_i16_zero:
5061 case Intrinsic::nvvm_suld_1d_v2i16_zero:
5062 case Intrinsic::nvvm_suld_1d_v4i16_zero:
5063 case Intrinsic::nvvm_suld_1d_array_i16_zero:
5064 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
5065 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
5066 case Intrinsic::nvvm_suld_2d_i16_zero:
5067 case Intrinsic::nvvm_suld_2d_v2i16_zero:
5068 case Intrinsic::nvvm_suld_2d_v4i16_zero:
5069 case Intrinsic::nvvm_suld_2d_array_i16_zero:
5070 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
5071 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
5072 case Intrinsic::nvvm_suld_3d_i16_zero:
5073 case Intrinsic::nvvm_suld_3d_v2i16_zero:
5074 case Intrinsic::nvvm_suld_3d_v4i16_zero:
5075 Info.opc = ISD::INTRINSIC_W_CHAIN;
5076 Info.memVT = MVT::i16;
5077 Info.ptrVal = nullptr;
5078 Info.offset = 0;
5079 Info.flags = MachineMemOperand::MOLoad;
5080 Info.align = Align(16);
5081 Infos.push_back(Info);
5082 return;
5083
5084 case Intrinsic::nvvm_suld_1d_i32_clamp:
5085 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
5086 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
5087 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5088 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5089 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5090 case Intrinsic::nvvm_suld_2d_i32_clamp:
5091 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5092 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5093 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5094 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5095 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5096 case Intrinsic::nvvm_suld_3d_i32_clamp:
5097 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5098 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5099 case Intrinsic::nvvm_suld_1d_i32_trap:
5100 case Intrinsic::nvvm_suld_1d_v2i32_trap:
5101 case Intrinsic::nvvm_suld_1d_v4i32_trap:
5102 case Intrinsic::nvvm_suld_1d_array_i32_trap:
5103 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5104 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5105 case Intrinsic::nvvm_suld_2d_i32_trap:
5106 case Intrinsic::nvvm_suld_2d_v2i32_trap:
5107 case Intrinsic::nvvm_suld_2d_v4i32_trap:
5108 case Intrinsic::nvvm_suld_2d_array_i32_trap:
5109 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5110 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5111 case Intrinsic::nvvm_suld_3d_i32_trap:
5112 case Intrinsic::nvvm_suld_3d_v2i32_trap:
5113 case Intrinsic::nvvm_suld_3d_v4i32_trap:
5114 case Intrinsic::nvvm_suld_1d_i32_zero:
5115 case Intrinsic::nvvm_suld_1d_v2i32_zero:
5116 case Intrinsic::nvvm_suld_1d_v4i32_zero:
5117 case Intrinsic::nvvm_suld_1d_array_i32_zero:
5118 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5119 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5120 case Intrinsic::nvvm_suld_2d_i32_zero:
5121 case Intrinsic::nvvm_suld_2d_v2i32_zero:
5122 case Intrinsic::nvvm_suld_2d_v4i32_zero:
5123 case Intrinsic::nvvm_suld_2d_array_i32_zero:
5124 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5125 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5126 case Intrinsic::nvvm_suld_3d_i32_zero:
5127 case Intrinsic::nvvm_suld_3d_v2i32_zero:
5128 case Intrinsic::nvvm_suld_3d_v4i32_zero:
5129 Info.opc = ISD::INTRINSIC_W_CHAIN;
5130 Info.memVT = MVT::i32;
5131 Info.ptrVal = nullptr;
5132 Info.offset = 0;
5133 Info.flags = MachineMemOperand::MOLoad;
5134 Info.align = Align(16);
5135 Infos.push_back(Info);
5136 return;
5137
5138 case Intrinsic::nvvm_suld_1d_i64_clamp:
5139 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5140 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5141 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5142 case Intrinsic::nvvm_suld_2d_i64_clamp:
5143 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5144 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5145 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5146 case Intrinsic::nvvm_suld_3d_i64_clamp:
5147 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5148 case Intrinsic::nvvm_suld_1d_i64_trap:
5149 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5150 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5151 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5152 case Intrinsic::nvvm_suld_2d_i64_trap:
5153 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5154 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5155 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5156 case Intrinsic::nvvm_suld_3d_i64_trap:
5157 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5158 case Intrinsic::nvvm_suld_1d_i64_zero:
5159 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5160 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5161 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5162 case Intrinsic::nvvm_suld_2d_i64_zero:
5163 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5164 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5165 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5166 case Intrinsic::nvvm_suld_3d_i64_zero:
5167 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5168 Info.opc = ISD::INTRINSIC_W_CHAIN;
5169 Info.memVT = MVT::i64;
5170 Info.ptrVal = nullptr;
5171 Info.offset = 0;
5172 Info.flags = MachineMemOperand::MOLoad;
5173 Info.align = Align(16);
5174 Infos.push_back(Info);
5175 return;
5176
5177 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
5178 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
5179 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
5180 Info.opc = ISD::INTRINSIC_W_CHAIN;
5181 Info.memVT = MVT::v1i32;
5182 Info.ptrVal = I.getArgOperand(0);
5183 Info.offset = 0;
5184 Info.flags = MachineMemOperand::MOLoad;
5185 Info.align.reset();
5186 Infos.push_back(Info);
5187 return;
5188 }
5189
5190 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
5191 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
5192 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
5193 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
5194 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
5195 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {
5196 Info.opc = ISD::INTRINSIC_W_CHAIN;
5197 Info.memVT = MVT::v2i32;
5198 Info.ptrVal = I.getArgOperand(0);
5199 Info.offset = 0;
5200 Info.flags = MachineMemOperand::MOLoad;
5201 Info.align.reset();
5202 Infos.push_back(Info);
5203 return;
5204 }
5205
5206 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
5207 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32: {
5208 Info.opc = ISD::INTRINSIC_W_CHAIN;
5209 Info.memVT = MVT::v2f32;
5210 Info.ptrVal = I.getArgOperand(0);
5211 Info.offset = 0;
5212 Info.flags = MachineMemOperand::MOLoad;
5213 Info.align.reset();
5214 Infos.push_back(Info);
5215 return;
5216 }
5217
5218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
5219 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
5220 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
5221 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
5222 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
5223 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
5224 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32: {
5225 Info.opc = ISD::INTRINSIC_W_CHAIN;
5226 Info.memVT = MVT::v4i32;
5227 Info.ptrVal = I.getArgOperand(0);
5228 Info.offset = 0;
5229 Info.flags = MachineMemOperand::MOLoad;
5230 Info.align.reset();
5231 Infos.push_back(Info);
5232 return;
5233 }
5234
5235 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
5236 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32: {
5237 Info.opc = ISD::INTRINSIC_W_CHAIN;
5238 Info.memVT = MVT::v4f32;
5239 Info.ptrVal = I.getArgOperand(0);
5240 Info.offset = 0;
5241 Info.flags = MachineMemOperand::MOLoad;
5242 Info.align.reset();
5243 Infos.push_back(Info);
5244 return;
5245 }
5246
5247 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
5248 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
5249 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
5250 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
5251 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
5252 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
5253 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32: {
5254 Info.opc = ISD::INTRINSIC_W_CHAIN;
5255 Info.memVT = MVT::v8i32;
5256 Info.ptrVal = I.getArgOperand(0);
5257 Info.offset = 0;
5258 Info.flags = MachineMemOperand::MOLoad;
5259 Info.align.reset();
5260 Infos.push_back(Info);
5261 return;
5262 }
5263
5264 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
5265 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32: {
5266 Info.opc = ISD::INTRINSIC_W_CHAIN;
5267 Info.memVT = MVT::v8f32;
5268 Info.ptrVal = I.getArgOperand(0);
5269 Info.offset = 0;
5270 Info.flags = MachineMemOperand::MOLoad;
5271 Info.align.reset();
5272 Infos.push_back(Info);
5273 return;
5274 }
5275
5276 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
5277 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
5278 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
5279 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
5280 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
5281 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
5282 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32: {
5283 Info.opc = ISD::INTRINSIC_W_CHAIN;
5284 Info.memVT = MVT::v16i32;
5285 Info.ptrVal = I.getArgOperand(0);
5286 Info.offset = 0;
5287 Info.flags = MachineMemOperand::MOLoad;
5288 Info.align.reset();
5289 Infos.push_back(Info);
5290 return;
5291 }
5292
5293 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
5294 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32: {
5295 Info.opc = ISD::INTRINSIC_W_CHAIN;
5296 Info.memVT = MVT::v16f32;
5297 Info.ptrVal = I.getArgOperand(0);
5298 Info.offset = 0;
5299 Info.flags = MachineMemOperand::MOLoad;
5300 Info.align.reset();
5301 Infos.push_back(Info);
5302 return;
5303 }
5304
5305 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
5306 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
5307 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
5308 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
5309 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
5310 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
5311 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32: {
5312 Info.opc = ISD::INTRINSIC_W_CHAIN;
5313 Info.memVT = MVT::v32i32;
5314 Info.ptrVal = I.getArgOperand(0);
5315 Info.offset = 0;
5316 Info.flags = MachineMemOperand::MOLoad;
5317 Info.align.reset();
5318 Infos.push_back(Info);
5319 return;
5320 }
5321
5322 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
5323 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32: {
5324 Info.opc = ISD::INTRINSIC_W_CHAIN;
5325 Info.memVT = MVT::v32f32;
5326 Info.ptrVal = I.getArgOperand(0);
5327 Info.offset = 0;
5328 Info.flags = MachineMemOperand::MOLoad;
5329 Info.align.reset();
5330 Infos.push_back(Info);
5331 return;
5332 }
5333
5334 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
5335 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
5336 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
5337 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
5338 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
5339 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
5340 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32: {
5341 Info.opc = ISD::INTRINSIC_W_CHAIN;
5342 Info.memVT = MVT::v64i32;
5343 Info.ptrVal = I.getArgOperand(0);
5344 Info.offset = 0;
5345 Info.flags = MachineMemOperand::MOLoad;
5346 Info.align.reset();
5347 Infos.push_back(Info);
5348 return;
5349 }
5350
5351 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
5352 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32: {
5353 Info.opc = ISD::INTRINSIC_W_CHAIN;
5354 Info.memVT = MVT::v64f32;
5355 Info.ptrVal = I.getArgOperand(0);
5356 Info.offset = 0;
5357 Info.flags = MachineMemOperand::MOLoad;
5358 Info.align.reset();
5359 Infos.push_back(Info);
5360 return;
5361 }
5362
5363 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
5364 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
5365 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
5366 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
5367 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
5368 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
5369 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32: {
5370 Info.opc = ISD::INTRINSIC_W_CHAIN;
5371 Info.memVT = MVT::v128i32;
5372 Info.ptrVal = I.getArgOperand(0);
5373 Info.offset = 0;
5374 Info.flags = MachineMemOperand::MOLoad;
5375 Info.align.reset();
5376 Infos.push_back(Info);
5377 return;
5378 }
5379
5380 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
5381 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {
5382 Info.opc = ISD::INTRINSIC_W_CHAIN;
5383 Info.memVT = MVT::v128f32;
5384 Info.ptrVal = I.getArgOperand(0);
5385 Info.offset = 0;
5386 Info.flags = MachineMemOperand::MOLoad;
5387 Info.align.reset();
5388 Infos.push_back(Info);
5389 return;
5390 }
5391
5392 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
5393 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
5394 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
5395 Info.opc = ISD::INTRINSIC_VOID;
5396 Info.memVT = MVT::i32;
5397 Info.ptrVal = I.getArgOperand(0);
5398 Info.offset = 0;
5399 Info.flags = MachineMemOperand::MOStore;
5400 Info.align.reset();
5401 Infos.push_back(Info);
5402 return;
5403 }
5404
5405 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
5406 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
5407 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
5408 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
5409 Info.opc = ISD::INTRINSIC_VOID;
5410 Info.memVT = MVT::v2i32;
5411 Info.ptrVal = I.getArgOperand(0);
5412 Info.offset = 0;
5413 Info.flags = MachineMemOperand::MOStore;
5414 Info.align.reset();
5415 Infos.push_back(Info);
5416 return;
5417 }
5418
5419 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
5420 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
5421 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
5422 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
5423 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
5424 Info.opc = ISD::INTRINSIC_VOID;
5425 Info.memVT = MVT::v4i32;
5426 Info.ptrVal = I.getArgOperand(0);
5427 Info.offset = 0;
5428 Info.flags = MachineMemOperand::MOStore;
5429 Info.align.reset();
5430 Infos.push_back(Info);
5431 return;
5432 }
5433
5434 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
5435 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
5436 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
5437 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
5438 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
5439 Info.opc = ISD::INTRINSIC_VOID;
5440 Info.memVT = MVT::v8i32;
5441 Info.ptrVal = I.getArgOperand(0);
5442 Info.offset = 0;
5443 Info.flags = MachineMemOperand::MOStore;
5444 Info.align.reset();
5445 Infos.push_back(Info);
5446 return;
5447 }
5448
5449 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
5450 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
5451 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
5452 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
5453 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
5454 Info.opc = ISD::INTRINSIC_VOID;
5455 Info.memVT = MVT::v16i32;
5456 Info.ptrVal = I.getArgOperand(0);
5457 Info.offset = 0;
5458 Info.flags = MachineMemOperand::MOStore;
5459 Info.align.reset();
5460 Infos.push_back(Info);
5461 return;
5462 }
5463
5464 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
5465 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
5466 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
5467 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
5468 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
5469 Info.opc = ISD::INTRINSIC_VOID;
5470 Info.memVT = MVT::v32i32;
5471 Info.ptrVal = I.getArgOperand(0);
5472 Info.offset = 0;
5473 Info.flags = MachineMemOperand::MOStore;
5474 Info.align.reset();
5475 Infos.push_back(Info);
5476 return;
5477 }
5478
5479 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
5480 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
5481 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
5482 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
5483 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
5484 Info.opc = ISD::INTRINSIC_VOID;
5485 Info.memVT = MVT::v64i32;
5486 Info.ptrVal = I.getArgOperand(0);
5487 Info.offset = 0;
5488 Info.flags = MachineMemOperand::MOStore;
5489 Info.align.reset();
5490 Infos.push_back(Info);
5491 return;
5492 }
5493
5494 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
5495 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
5496 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
5497 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
5498 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
5499 Info.opc = ISD::INTRINSIC_VOID;
5500 Info.memVT = MVT::v128i32;
5501 Info.ptrVal = I.getArgOperand(0);
5502 Info.offset = 0;
5503 Info.flags = MachineMemOperand::MOStore;
5504 Info.align.reset();
5505 Infos.push_back(Info);
5506 return;
5507 }
5508 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
5509 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
5510 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
5511 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
5512 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
5513 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
5514 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
5515 case Intrinsic::
5516 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
5517 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
5518 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
5519 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
5520 case Intrinsic::
5521 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
5522 // We are reading and writing back to TMem
5523 Info.opc = ISD::INTRINSIC_VOID;
5524 Info.memVT = MVT::v4i32;
5525 Info.ptrVal = I.getArgOperand(0);
5526 Info.offset = 0;
5528 Info.align = Align(16);
5529 Infos.push_back(Info);
5530 return;
5531 }
5532
5533 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
5534 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
5535 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
5536 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
5537 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
5538 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
5539 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
5540 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
5541 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
5542 case Intrinsic::
5543 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
5544 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
5545 case Intrinsic::
5546 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
5547 // We are reading and writing back to TMem
5548 Info.opc = ISD::INTRINSIC_VOID;
5549 Info.memVT = MVT::v8i32;
5550 Info.ptrVal = I.getArgOperand(0);
5551 Info.offset = 0;
5553 Info.align = Align(16);
5554 Infos.push_back(Info);
5555 return;
5556 }
5557 }
5558}
5559
5560// Helper for getting a function parameter name. Name is composed from
5561// its index and the function name. Negative index corresponds to special
5562// parameter (unsized array) used for passing variable arguments.
5564 int Idx) const {
5565 std::string ParamName;
5566 raw_string_ostream ParamStr(ParamName);
5567
5568 ParamStr << getTargetMachine().getSymbol(F)->getName();
5569 if (Idx < 0)
5570 ParamStr << "_vararg";
5571 else
5572 ParamStr << "_param_" << Idx;
5573
5574 return ParamName;
5575}
5576
5577/// isLegalAddressingMode - Return true if the addressing mode represented
5578/// by AM is legal for this target, for a load/store of the specified type.
5579/// Used to guide target specific optimizations, like loop strength reduction
5580/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5581/// (CodeGenPrepare.cpp)
5583 const AddrMode &AM, Type *Ty,
5584 unsigned AS, Instruction *I) const {
5585 // AddrMode - This represents an addressing mode of:
5586 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5587 //
5588 // The legal address modes are
5589 // - [avar]
5590 // - [areg]
5591 // - [areg+immoff]
5592 // - [immAddr]
5593
5594 // immoff must fit in a signed 32-bit int
5595 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5596 return false;
5597
5598 if (AM.BaseGV)
5599 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5600
5601 switch (AM.Scale) {
5602 case 0: // "r", "r+i" or "i" is allowed
5603 break;
5604 case 1:
5605 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5606 return false;
5607 // Otherwise we have r+i.
5608 break;
5609 default:
5610 // No scale > 1 is allowed
5611 return false;
5612 }
5613 return true;
5614}
5615
5616//===----------------------------------------------------------------------===//
5617// NVPTX Inline Assembly Support
5618//===----------------------------------------------------------------------===//
5619
5620/// getConstraintType - Given a constraint letter, return the type of
5621/// constraint it is for this target.
5624 if (Constraint.size() == 1) {
5625 switch (Constraint[0]) {
5626 default:
5627 break;
5628 case 'b':
5629 case 'r':
5630 case 'h':
5631 case 'c':
5632 case 'l':
5633 case 'f':
5634 case 'd':
5635 case 'q':
5636 case '0':
5637 case 'N':
5638 return C_RegisterClass;
5639 }
5640 }
5641 return TargetLowering::getConstraintType(Constraint);
5642}
5643
5644std::pair<unsigned, const TargetRegisterClass *>
5646 StringRef Constraint,
5647 MVT VT) const {
5648 if (Constraint.size() == 1) {
5649 switch (Constraint[0]) {
5650 case 'b':
5651 return std::make_pair(0U, &NVPTX::B1RegClass);
5652 case 'c':
5653 case 'h':
5654 return std::make_pair(0U, &NVPTX::B16RegClass);
5655 case 'r':
5656 case 'f':
5657 return std::make_pair(0U, &NVPTX::B32RegClass);
5658 case 'l':
5659 case 'N':
5660 case 'd':
5661 return std::make_pair(0U, &NVPTX::B64RegClass);
5662 case 'q': {
5663 if (STI.getSmVersion() < 70)
5664 report_fatal_error("Inline asm with 128 bit operands is only "
5665 "supported for sm_70 and higher!");
5666 return std::make_pair(0U, &NVPTX::B128RegClass);
5667 }
5668 }
5669 }
5670 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5671}
5672
5673//===----------------------------------------------------------------------===//
5674// NVPTX DAG Combining
5675//===----------------------------------------------------------------------===//
5676
5678 CodeGenOptLevel OptLevel) const {
5679 // Always honor command-line argument
5680 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5681 return FMAContractLevelOpt > 0;
5682
5683 // Do not contract if we're not optimizing the code.
5684 if (OptLevel == CodeGenOptLevel::None)
5685 return false;
5686
5687 // Honor TargetOptions flags that explicitly say fusion is okay.
5689 return true;
5690
5691 return false;
5692}
5693
5694static bool isConstZero(const SDValue &Operand) {
5695 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5696 return Const && Const->getZExtValue() == 0;
5697}
5698
5699/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5700/// operands N0 and N1. This is a helper for PerformADDCombine that is
5701/// called with the default operands, and if that fails, with commuted
5702/// operands.
5703static SDValue
5706 EVT VT = N0.getValueType();
5707
5708 // Since integer multiply-add costs the same as integer multiply
5709 // but is more costly than integer add, do the fusion only when
5710 // the mul is only used in the add.
5711 // TODO: this may not be true for later architectures, consider relaxing this
5712 if (!N0.getNode()->hasOneUse())
5713 return SDValue();
5714
5715 // fold (add (select cond, 0, (mul a, b)), c)
5716 // -> (select cond, c, (add (mul a, b), c))
5717 //
5718 if (N0.getOpcode() == ISD::SELECT) {
5719 unsigned ZeroOpNum;
5720 if (isConstZero(N0->getOperand(1)))
5721 ZeroOpNum = 1;
5722 else if (isConstZero(N0->getOperand(2)))
5723 ZeroOpNum = 2;
5724 else
5725 return SDValue();
5726
5727 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5728 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5729 return SDValue();
5730
5731 SDLoc DL(N);
5732 SDValue Mul =
5733 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5734 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5735 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5736 ((ZeroOpNum == 1) ? N1 : MAD),
5737 ((ZeroOpNum == 1) ? MAD : N1));
5738 }
5739
5740 return SDValue();
5741}
5742
5743SDValue NVPTXTargetLowering::performFADDCombineWithOperands(
5745 CodeGenOptLevel OptLevel) const {
5746 EVT VT = N0.getValueType();
5747 if (N0.getOpcode() == ISD::FMUL) {
5748 if (!(allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5749 (N->getFlags().hasAllowContract() &&
5750 N0->getFlags().hasAllowContract())))
5751 return SDValue();
5752
5753 // For floating point:
5754 // Do the fusion only when the mul has less than 5 uses and all
5755 // are add.
5756 // The heuristic is that if a use is not an add, then that use
5757 // cannot be fused into fma, therefore mul is still needed anyway.
5758 // If there are more than 4 uses, even if they are all add, fusing
5759 // them will increase register pressure.
5760 //
5761 int numUses = 0;
5762 int nonAddCount = 0;
5763 for (const SDNode *User : N0.getNode()->users()) {
5764 numUses++;
5765 if (User->getOpcode() != ISD::FADD)
5766 ++nonAddCount;
5767 if (numUses >= 5)
5768 return SDValue();
5769 }
5770 if (nonAddCount) {
5771 int orderNo = N->getIROrder();
5772 int orderNo2 = N0.getNode()->getIROrder();
5773 // Simple heuristic for estimating potential register pressure:
5774 // the difference between the two IR order numbers is used
5775 // to measure the distance between def and use; the longer the distance,
5776 // the more likely it is to cause register pressure.
5777 if (orderNo - orderNo2 < 500)
5778 return SDValue();
5779
5780 // Now, check if at least one of the FMUL's operands is live beyond the
5781 // node N, which guarantees that the FMA will not increase register
5782 // pressure at node N.
5783 bool opIsLive = false;
5784 const SDNode *left = N0.getOperand(0).getNode();
5785 const SDNode *right = N0.getOperand(1).getNode();
5786
5787 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5788 opIsLive = true;
5789
5790 if (!opIsLive)
5791 for (const SDNode *User : left->users()) {
5792 int orderNo3 = User->getIROrder();
5793 if (orderNo3 > orderNo) {
5794 opIsLive = true;
5795 break;
5796 }
5797 }
5798
5799 if (!opIsLive)
5800 for (const SDNode *User : right->users()) {
5801 int orderNo3 = User->getIROrder();
5802 if (orderNo3 > orderNo) {
5803 opIsLive = true;
5804 break;
5805 }
5806 }
5807
5808 if (!opIsLive)
5809 return SDValue();
5810 }
5811
5812 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5813 N0.getOperand(1), N1);
5814 }
5815
5816 return SDValue();
5817}
5818
5819/// Fold unpacking movs into a load by increasing the number of return values.
5820///
5821/// ex:
5822/// L: v2f16,ch = load <p>
5823/// a: f16 = extractelt L:0, 0
5824/// b: f16 = extractelt L:0, 1
5825/// use(a, b)
5826///
5827/// ...is turned into...
5828///
5829/// L: f16,f16,ch = LoadV2 <p>
5830/// use(L:0, L:1)
5831static SDValue
5833 // Don't run this optimization before the legalizer
5834 if (!DCI.isAfterLegalizeDAG())
5835 return SDValue();
5836
5837 EVT ElementVT = N->getValueType(0);
5838 // Avoid non-packed types and v4i8
5839 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5840 return SDValue();
5841
5842 // Check whether all outputs are either used by an extractelt or are
5843 // glue/chain nodes
5844 if (!all_of(N->uses(), [&](SDUse &U) {
5845 // Skip glue, chain nodes
5846 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5847 return true;
5848 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5849 if (N->getOpcode() != ISD::LOAD)
5850 return true;
5851 // Since this is an ISD::LOAD, check all extractelts are used. If
5852 // any are not used, we don't want to defeat another optimization that
5853 // will narrow the load.
5854 //
5855 // For example:
5856 //
5857 // L: v2f16,ch = load <p>
5858 // e0: f16 = extractelt L:0, 0
5859 // e1: f16 = extractelt L:0, 1 <-- unused
5860 // store e0
5861 //
5862 // Can be optimized by DAGCombiner to:
5863 //
5864 // L: f16,ch = load <p>
5865 // store L:0
5866 return !U.getUser()->use_empty();
5867 }
5868
5869 // Otherwise, this use prevents us from splitting a value.
5870 return false;
5871 }))
5872 return SDValue();
5873
5874 auto *LD = cast<MemSDNode>(N);
5875 SDLoc DL(LD);
5876
5877 // the new opcode after we double the number of operands
5878 unsigned Opcode;
5879 SmallVector<SDValue> Operands(LD->ops());
5880 unsigned OldNumOutputs; // non-glue, non-chain outputs
5881 switch (LD->getOpcode()) {
5882 case ISD::LOAD:
5883 OldNumOutputs = 1;
5884 // Any packed type is legal, so the legalizer will not have lowered
5885 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5886 // here.
5887 Opcode = NVPTXISD::LoadV2;
5888 // append a "full" used bytes mask operand right before the extension type
5889 // operand, signifying that all bytes are used.
5890 Operands.push_back(DCI.DAG.getConstant(UINT32_MAX, DL, MVT::i32));
5891 Operands.push_back(DCI.DAG.getIntPtrConstant(
5892 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5893 break;
5894 case NVPTXISD::LoadV2:
5895 OldNumOutputs = 2;
5896 Opcode = NVPTXISD::LoadV4;
5897 break;
5898 case NVPTXISD::LoadV4:
5899 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5900 // load size here. This is already a 256-bit load.
5901 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5902 return SDValue();
5903 OldNumOutputs = 4;
5904 Opcode = NVPTXISD::LoadV8;
5905 break;
5906 case NVPTXISD::LoadV8:
5907 // PTX doesn't support the next doubling of outputs
5908 return SDValue();
5909 }
5910
5911 // the non-glue, non-chain outputs in the new load
5912 const unsigned NewNumOutputs = OldNumOutputs * 2;
5913 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5914 // add remaining chain and glue values
5915 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5916
5917 // Create the new load
5918 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5919 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5920 LD->getMemOperand());
5921
5922 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5923 // the outputs the same. These nodes will be optimized away in later
5924 // DAGCombiner iterations.
5926 for (unsigned I : seq(OldNumOutputs))
5927 Results.push_back(DCI.DAG.getBuildVector(
5928 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5929 // Add remaining chain and glue nodes
5930 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5931 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5932
5933 return DCI.DAG.getMergeValues(Results, DL);
5934}
5935
5936/// Fold packing movs into a store.
5937///
5938/// ex:
5939/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5940/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5941/// StoreV2 v1, v2
5942///
5943/// ...is turned into...
5944///
5945/// StoreV4 a, b, c, d
5948 unsigned Front, unsigned Back) {
5949 // We want to run this as late as possible since other optimizations may
5950 // eliminate the BUILD_VECTORs.
5951 if (!DCI.isAfterLegalizeDAG())
5952 return SDValue();
5953
5954 // Get the type of the operands being stored.
5955 EVT ElementVT = N->getOperand(Front).getValueType();
5956
5957 // Avoid non-packed types and v4i8
5958 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5959 return SDValue();
5960
5961 auto *ST = cast<MemSDNode>(N);
5962
5963 // The new opcode after we double the number of operands.
5964 unsigned Opcode;
5965 switch (N->getOpcode()) {
5966 case ISD::STORE:
5967 // Any packed type is legal, so the legalizer will not have lowered
5968 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5969 // it here.
5970 Opcode = NVPTXISD::StoreV2;
5971 break;
5972 case NVPTXISD::StoreV2:
5973 Opcode = NVPTXISD::StoreV4;
5974 break;
5975 case NVPTXISD::StoreV4:
5976 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5977 // store size here. This is already a 256-bit store.
5978 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5979 return SDValue();
5980 Opcode = NVPTXISD::StoreV8;
5981 break;
5982 case NVPTXISD::StoreV8:
5983 // PTX doesn't support the next doubling of operands
5984 return SDValue();
5985 default:
5986 llvm_unreachable("Unhandled store opcode");
5987 }
5988
5989 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5990 // their elements.
5991 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5992 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5993 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5994 return SDValue();
5995
5996 // If the operand has multiple uses, this optimization can increase register
5997 // pressure.
5998 if (!BV.hasOneUse())
5999 return SDValue();
6000
6001 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
6002 // any signs they may be folded by some other pattern or rule.
6003 for (SDValue Op : BV->ops()) {
6004 // Peek through bitcasts
6005 if (Op.getOpcode() == ISD::BITCAST)
6006 Op = Op.getOperand(0);
6007
6008 // This may be folded into a PRMT.
6009 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
6010 Op->getOperand(0).getValueType() == MVT::i32)
6011 return SDValue();
6012
6013 // This may be folded into cvt.bf16x2
6014 if (Op.getOpcode() == ISD::FP_ROUND)
6015 return SDValue();
6016 }
6017 Operands.append({BV.getOperand(0), BV.getOperand(1)});
6018 }
6019 Operands.append(N->op_end() - Back, N->op_end());
6020
6021 // Now we replace the store
6022 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
6023 ST->getMemoryVT(), ST->getMemOperand());
6024}
6025
6027 const NVPTXSubtarget &STI) {
6028
6029 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
6030 // Here is our chance to custom lower a store with a non-simple type.
6031 // Unfortunately, we can't do this in the legalizer because there is no
6032 // way to setOperationAction for a non-simple type.
6034 if (!ST->getValue().getValueType().isSimple())
6035 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
6036 }
6037
6038 return combinePackingMovIntoStore(N, DCI, 1, 2);
6039}
6040
6042 const NVPTXSubtarget &STI) {
6043 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
6044 // Here is our chance to custom lower a load with a non-simple type.
6045 // Unfortunately, we can't do this in the legalizer because there is no
6046 // way to setOperationAction for a non-simple type.
6047 if (!N->getValueType(0).isSimple())
6048 return lowerLoadVector(N, DCI.DAG, STI);
6049 }
6050
6051 return combineUnpackingMovIntoLoad(N, DCI);
6052}
6053
6054/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
6055///
6058 CodeGenOptLevel OptLevel) {
6059 if (OptLevel == CodeGenOptLevel::None)
6060 return SDValue();
6061
6062 SDValue N0 = N->getOperand(0);
6063 SDValue N1 = N->getOperand(1);
6064
6065 // Skip non-integer, non-scalar case
6066 EVT VT = N0.getValueType();
6067 if (VT.isVector() || VT != MVT::i32)
6068 return SDValue();
6069
6070 // First try with the default operand order.
6071 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
6072 return Result;
6073
6074 // If that didn't work, try again with the operands commuted.
6075 return PerformADDCombineWithOperands(N, N1, N0, DCI);
6076}
6077
6078/// Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent
6079/// register pairs (non-coalescable).
6080static bool isNonCoalescableBuildVector(const SDValue &BV) {
6081 if (BV.getOpcode() != ISD::BUILD_VECTOR || BV.getValueType() != MVT::v2f32)
6082 return false;
6083
6084 SDValue Elt0 = BV.getOperand(0);
6085 SDValue Elt1 = BV.getOperand(1);
6086
6087 bool IsExt0 = Elt0.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
6088 bool IsExt1 = Elt1.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
6089
6090 // If neither element is an EXTRACT_VECTOR_ELT they are free-standing
6091 // scalars and the register allocator can still place them side-by-side.
6092 if (!IsExt0 && !IsExt1)
6093 return false;
6094
6095 // If exactly one element is an EXTRACT_VECTOR_ELT, the other is a scalar
6096 // that cannot generally occupy the adjacent register slot.
6097 if (IsExt0 != IsExt1)
6098 return true;
6099
6100 // At this point both sources are extracting from vectors. If they are from
6101 // different vectors, then the BUILD_VECTOR is non-coalescable.
6102 SDValue Src0 = Elt0.getOperand(0);
6103 SDValue Src1 = Elt1.getOperand(0);
6104 if (Src0 != Src1)
6105 return true;
6106
6107 auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0.getOperand(1));
6108 auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1.getOperand(1));
6109 // If both indices are dynamic they will be lowered to
6110 // loads and the vector will be spilled to local memory. The register
6111 // allocator can easily place the results in adjacent registers.
6112 if (!Idx0 && !Idx1)
6113 return false;
6114
6115 // If one index is dynamic and the other is constant, the value from the
6116 // constant load will result in an additional register to pair with the result
6117 // from the dynamic load. We consider this non-coalescable.
6118 if ((Idx0 && !Idx1) || (!Idx0 && Idx1))
6119 return true;
6120
6121 // Both are constant, adjacent pairs are coalescable
6122 return std::abs(Idx0->getSExtValue() - Idx1->getSExtValue()) != 1;
6123}
6124
6125/// Return true if FMUL v2f32 node \p N may be scalarized to fold each lane's
6126/// product into a scalar FMA.
6127bool NVPTXTargetLowering::mayFoldFMULIntoFMA(SDNode *N, MachineFunction &MF,
6128 CodeGenOptLevel OptLevel) const {
6129 if (N->getOpcode() != ISD::FMUL || N->getValueType(0) != MVT::v2f32)
6130 return false;
6131 const bool GlobalFMA = allowFMA(MF, OptLevel);
6132 if (!N->getFlags().hasAllowContract() && !GlobalFMA)
6133 return false;
6134
6135 const SDNode *FirstFAdd = nullptr;
6136 unsigned NumScalarFAdd = 0;
6137
6138 // Both lanes must feed unique FADDs
6139 for (SDNode *EE : N->users()) {
6140 if (NumScalarFAdd == 2)
6141 return false;
6142
6143 if (EE->getOpcode() != ISD::EXTRACT_VECTOR_ELT || !EE->hasOneUse() ||
6144 !isa<ConstantSDNode>(EE->getOperand(1)))
6145 return false;
6146
6147 const SDNode *const FAdd = *EE->users().begin();
6148 if (FAdd->getOpcode() != ISD::FADD ||
6149 (!GlobalFMA && !FAdd->getFlags().hasAllowContract()))
6150 return false;
6151
6152 if (!FirstFAdd)
6153 FirstFAdd = FAdd;
6154 else if (FAdd == FirstFAdd)
6155 return false;
6156
6157 NumScalarFAdd++;
6158 }
6159
6160 return NumScalarFAdd == 2;
6161}
6162
6163/// Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least
6164/// one operand is a BUILD_VECTOR that repacks values from non-adjacent register
6165/// pairs. Without this combine the BUILD_VECTOR forces allocation of a
6166/// temporary 64-bit register, increasing register pressure.
6167///
6168/// Example - before:
6169/// t0: v2f32,v2f32,ch = LoadV2 ...
6170/// t1: f32 = extract_vector_elt t0, 0
6171/// t2: f32 = extract_vector_elt t0:1, 0
6172/// t3: v2f32 = BUILD_VECTOR t1, t2 ;; non-coalescable repack
6173/// t4: v2f32 = fma t_a, t3, t_c
6174///
6175/// After:
6176/// t0: v2f32,v2f32,ch = LoadV2 ...
6177/// t1: f32 = extract_vector_elt t0, 0
6178/// t2: f32 = extract_vector_elt t0:1, 0
6179/// a0: f32 = extract_vector_elt t_a, 0
6180/// a1: f32 = extract_vector_elt t_a, 1
6181/// c0: f32 = extract_vector_elt t_c, 0
6182/// c1: f32 = extract_vector_elt t_c, 1
6183/// r0: f32 = fma a0, t1, c0
6184/// r1: f32 = fma a1, t2, c1
6185/// t4: v2f32 = BUILD_VECTOR r0, r1
6186///
6187/// Also scalarizes an FMUL when all output lanes feed into scalar FADDs
6188/// to enable scalar FMA combining.
// Returns an empty SDValue when the node is left untouched; otherwise returns
// a v2f32 BUILD_VECTOR of the two per-lane scalar results.
// NOTE(review): embedded listing line 6190 is missing from this view; it
// presumably declares the leading parameters (the body uses N and DCI) --
// confirm against upstream.
6189SDValue NVPTXTargetLowering::performScalarizeV2F32Op(
6191 CodeGenOptLevel OptLevel) const {
6192 EVT VT = N->getValueType(0);
// Only the v2f32 result type is handled.
6193 if (VT != MVT::v2f32)
6194 return SDValue();
6195
// Nothing to do unless some operand is a non-coalescable BUILD_VECTOR or the
// node is an FMUL whose lanes can be folded into scalar FMAs.
6196 if (none_of(N->ops(), isNonCoalescableBuildVector) &&
6197 !mayFoldFMULIntoFMA(N, DCI.DAG.getMachineFunction(), OptLevel))
6198 return SDValue();
6199
6200 SelectionDAG &DAG = DCI.DAG;
6201 SDLoc DL(N);
6202 EVT EltVT = VT.getVectorElementType();
6203 unsigned Opc = N->getOpcode();
6204
6205 // For each operand, get the scalar element at the given index: if the operand
6206 // is a BUILD_VECTOR, grab the element directly; otherwise, emit an
6207 // EXTRACT_VECTOR_ELT.
6208 auto GetElement = [&](SDValue Op, unsigned Index) -> SDValue {
6209 if (Op.getOpcode() == ISD::BUILD_VECTOR)
6210 return Op.getOperand(Index);
6211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
6212 DAG.getVectorIdxConstant(Index, DL));
6213 };
6214
6215 // Build scalar operand lists for element 0 and element 1.
6216 SmallVector<SDValue, 3> Ops0, Ops1;
6217 for (const SDValue &Op : N->ops()) {
6218 Ops0.push_back(GetElement(Op, 0));
6219 Ops1.push_back(GetElement(Op, 1));
6220 }
6221
// Emit the same opcode once per lane, carrying over the original node flags.
6222 SDValue Res0 = DAG.getNode(Opc, DL, EltVT, Ops0, N->getFlags());
6223 SDValue Res1 = DAG.getNode(Opc, DL, EltVT, Ops1, N->getFlags());
6224
// Repack the scalar results so users still see a v2f32 value.
6225 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Res0, Res1);
6226}
6227
6228/// Target-specific dag combine xforms for ISD::FADD.
// Tries the v2f32 scalarization first. If that does not apply and the add is
// a scalar f32/f64, tries performFADDCombineWithOperands with both operand
// orders. Returns an empty SDValue when nothing matched.
// NOTE(review): embedded listing line 6231 is missing from this view; it
// presumably declares the DCI parameter used below -- confirm against
// upstream.
6229SDValue
6230NVPTXTargetLowering::performFADDCombine(SDNode *N,
6232 CodeGenOptLevel OptLevel) const {
6233 if (SDValue Result = performScalarizeV2F32Op(N, DCI, OptLevel))
6234 return Result;
6235
6236 SDValue N0 = N->getOperand(0);
6237 SDValue N1 = N->getOperand(1);
6238
// The operand-based combine is restricted to scalar f32 and f64.
6239 EVT VT = N0.getValueType();
6240 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
6241 return SDValue();
6242
6243 // First try with the default operand order.
6244 if (SDValue Result = performFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
6245 return Result;
6246
6247 // If that didn't work, try again with the operands commuted.
6248 return performFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
6249}
6250
6251/// Get 3-input version of a 2-input min/max opcode
6252static unsigned getMinMax3Opcode(unsigned MinMax2Opcode) {
6253 switch (MinMax2Opcode) {
6254 case ISD::FMAXNUM:
6255 case ISD::FMAXIMUMNUM:
6256 return NVPTXISD::FMAXNUM3;
6257 case ISD::FMINNUM:
6258 case ISD::FMINIMUMNUM:
6259 return NVPTXISD::FMINNUM3;
6260 case ISD::FMAXIMUM:
6261 return NVPTXISD::FMAXIMUM3;
6262 case ISD::FMINIMUM:
6263 return NVPTXISD::FMINIMUM3;
6264 default:
6265 llvm_unreachable("Invalid 2-input min/max opcode");
6266 }
6267}
6268
6269/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
6270/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
// Returns an empty SDValue when the target or type does not qualify, or when
// neither operand is a single-use node with the same opcode as N.
// NOTE(review): embedded listing lines 6271-6272 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
6273 unsigned PTXVersion, unsigned SmVersion) {
6274
6275 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
6276 EVT VT = N->getValueType(0);
6277 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
6278 return SDValue();
6279
6280 SDValue Op0 = N->getOperand(0);
6281 SDValue Op1 = N->getOperand(1);
6282 unsigned MinMaxOp2 = N->getOpcode();
6283 unsigned MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
6284
// The inner node is only absorbed when N is its sole user; the new node
// reuses N's flags.
6285 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
6286 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
6287 SDValue A = Op0.getOperand(0);
6288 SDValue B = Op0.getOperand(1);
6289 SDValue C = Op1;
6290 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6291 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
6292 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
6293 SDValue A = Op0;
6294 SDValue B = Op1.getOperand(0);
6295 SDValue C = Op1.getOperand(1);
6296 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6297 }
6298 return SDValue();
6299}
6300
// Rewrites an SREM/UREM as Num - (Num / Den) * Den, but only when a division
// of the same two operands already exists among the numerator's users.
// Returns an empty SDValue otherwise.
// NOTE(review): embedded listing lines 6301-6302 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
6303 CodeGenOptLevel OptLevel) {
6304 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
6305
6306 // Don't do anything at less than -O2.
6307 if (OptLevel < CodeGenOptLevel::Default)
6308 return SDValue();
6309
6310 SelectionDAG &DAG = DCI.DAG;
6311 SDLoc DL(N);
6312 EVT VT = N->getValueType(0);
6313 bool IsSigned = N->getOpcode() == ISD::SREM;
6314 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
6315
6316 const SDValue &Num = N->getOperand(0);
6317 const SDValue &Den = N->getOperand(1);
6318
// Look for an existing division with matching signedness and the same
// (Num, Den) operands in the same order.
// NOTE(review): the rebuilt DivOpc node is presumably expected to be shared
// with that existing division -- confirm.
6319 for (const SDNode *U : Num->users()) {
6320 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
6321 U->getOperand(1) == Den) {
6322 // Num % Den -> Num - (Num / Den) * Den
6323 return DAG.getNode(ISD::SUB, DL, VT, Num,
6324 DAG.getNode(ISD::MUL, DL, VT,
6325 DAG.getNode(DivOpc, DL, VT, Num, Den),
6326 Den));
6327 }
6328 }
6329 return SDValue();
6330}
6331
6332// sext (mul.iN nsw x, y) => mul.wide.sN x, y
6333// zext (mul.iN nuw x, y) => mul.wide.uN x, y
6334// sext (shl.iN nsw x, const) => mul.wide.sN x, (1 << const)
6335// zext (shl.iN nuw x, const) => mul.wide.uN x, (1 << const)
//
// Only i16 -> i32 and i32 -> i64 extensions of a single-use operand are
// handled. Returns an empty SDValue when the pattern does not match.
// NOTE(review): embedded listing lines 6336-6337 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
6338 CodeGenOptLevel OptLevel) {
6339 assert(N->getOpcode() == ISD::SIGN_EXTEND ||
6340 N->getOpcode() == ISD::ZERO_EXTEND);
6341
6342 if (OptLevel == CodeGenOptLevel::None)
6343 return SDValue();
6344
6345 SDValue Op = N->getOperand(0);
6346 if (!Op.hasOneUse())
6347 return SDValue();
6348
6349 EVT ToVT = N->getValueType(0);
6350 EVT FromVT = Op.getValueType();
6351 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
6352 (ToVT == MVT::i64 && FromVT == MVT::i32)))
6353 return SDValue();
6354
// The no-wrap flag must match the kind of extension: nsw for sext, nuw for
// zext.
6355 bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
6356 if ((IsSigned && !Op->getFlags().hasNoSignedWrap()) ||
6357 (!IsSigned && !Op->getFlags().hasNoUnsignedWrap()))
6358 return SDValue();
6359
// NOTE(review): operands 0 and 1 are read before the opcode is checked below.
// If a single-operand node carrying nsw/nuw (possibly TRUNCATE) can reach
// this point, getOperand(1) would be out of range -- confirm this is
// unreachable or move the opcode check earlier.
6360 SDLoc DL(N);
6361 SDValue LHS = Op.getOperand(0);
6362 SDValue RHS = Op.getOperand(1);
6363 unsigned MulWideOpcode =
6364 IsSigned ? NVPTXISD::MUL_WIDE_SIGNED : NVPTXISD::MUL_WIDE_UNSIGNED;
6365 if (Op.getOpcode() == ISD::MUL) {
6366 return DCI.DAG.getNode(MulWideOpcode, DL, ToVT, LHS, RHS);
6367 } else if (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(RHS)) {
// Express the constant shift as a multiplication by (1 << ShiftAmt) in the
// narrow type.
6368 const auto ShiftAmt = Op.getConstantOperandVal(1);
6369 const auto MulVal = APInt(FromVT.getSizeInBits(), 1) << ShiftAmt;
6370
6371 // Note that the sext (shl nsw ...) case doesn't work if 1 << const
6372 // overflows to a negative value! The only valid input values in this
6373 // case are 0 and -1 (all other values yield poison because of the nsw),
6374 // and mul.wide.sN would give us the wrong sign for -1. We could use
6375 // mul.wide.uN, but since this is a weird case anyway, we might as well not
6376 // apply this transformation at all.
6377 if (IsSigned && MulVal.isNegative())
6378 return SDValue();
6379
6380 RHS = DCI.DAG.getConstant(MulVal, DL, FromVT);
6381 return DCI.DAG.getNode(MulWideOpcode, DL, ToVT, LHS, RHS);
6382 }
6383
6384 return SDValue();
6385}
6386
6392
6393/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
6394/// that can be demoted to \p OptSize bits without loss of information. The
6395/// signedness of the operand, if determinable, is placed in \p S.
// Returns true only for SIGN_EXTEND / SIGN_EXTEND_INREG / ZERO_EXTEND nodes
// whose first operand's type is at most OptSize bits wide; S stays Unknown
// whenever false is returned.
// NOTE(review): embedded listing line 6396 is missing from this view; it
// presumably holds the function name and the Op parameter -- confirm against
// upstream.
6397 unsigned OptSize,
6398 OperandSignedness &S) {
6399 S = Unknown;
6400
6401 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
6402 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
6403 EVT OrigVT = Op.getOperand(0).getValueType();
6404 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6405 S = Signed;
6406 return true;
6407 }
6408 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
6409 EVT OrigVT = Op.getOperand(0).getValueType();
6410 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6411 S = Unsigned;
6412 return true;
6413 }
6414 }
6415
6416 return false;
6417}
6418
6419/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
6420/// be demoted to \p OptSize bits without loss of information. If the operands
6421/// contain a constant, it should appear as the RHS operand. The signedness of
6422/// the operands is placed in \p IsSigned.
// NOTE(review): embedded listing lines 6423 and 6439 are missing from this
// view. 6423 presumably holds the function name and the LHS/RHS parameters;
// 6439 presumably opens the `if` that binds CI when RHS is a constant (it
// pairs with the `} else {` below) -- confirm against upstream.
6424 unsigned OptSize,
6425 bool &IsSigned) {
6426 OperandSignedness LHSSign;
6427
6428 // The LHS operand must be a demotable op
6429 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
6430 return false;
6431
6432 // We should have been able to determine the signedness from the LHS
6433 if (LHSSign == Unknown)
6434 return false;
6435
6436 IsSigned = (LHSSign == Signed);
6437
6438 // The RHS can be a demotable op or a constant
6440 const APInt &Val = CI->getAPIntValue();
// A constant must fit in OptSize bits under the signedness chosen by the LHS.
6441 if (LHSSign == Unsigned) {
6442 return Val.isIntN(OptSize);
6443 } else {
6444 return Val.isSignedIntN(OptSize);
6445 }
6446 } else {
// A non-constant RHS must be demotable and agree with the LHS signedness.
6447 OperandSignedness RHSSign;
6448 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
6449 return false;
6450
6451 return LHSSign == RHSSign;
6452 }
6453}
6454
6455/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
6456/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
6457/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
6458/// amount.
// Returns an empty SDValue when the result type is not i32/i64, when a SHL
// has a non-constant or out-of-range shift amount, or when the operands
// cannot be demoted to half width.
// NOTE(review): embedded listing lines 6459-6460 and 6480 are missing from
// this view. 6459-6460 presumably hold the function name and the N/DCI
// parameters; 6480 presumably declares ShlRHS from RHS -- confirm against
// upstream.
6461 EVT MulType = N->getValueType(0);
6462 if (MulType != MVT::i32 && MulType != MVT::i64) {
6463 return SDValue();
6464 }
6465
6466 SDLoc DL(N);
// OptSize is half the width of the multiply's result type.
6467 unsigned OptSize = MulType.getSizeInBits() >> 1;
6468 SDValue LHS = N->getOperand(0);
6469 SDValue RHS = N->getOperand(1);
6470
6471 // Canonicalize the multiply so the constant (if any) is on the right
6472 if (N->getOpcode() == ISD::MUL) {
6473 if (isa<ConstantSDNode>(LHS)) {
6474 std::swap(LHS, RHS);
6475 }
6476 }
6477
6478 // If we have a SHL, determine the actual multiply amount
6479 if (N->getOpcode() == ISD::SHL) {
6481 if (!ShlRHS) {
6482 return SDValue();
6483 }
6484
// Only shift amounts in [0, BitWidth) are turned into the constant
// multiplier 1 << ShiftAmt.
6485 APInt ShiftAmt = ShlRHS->getAPIntValue();
6486 unsigned BitWidth = MulType.getSizeInBits();
6487 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
6488 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
6489 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
6490 } else {
6491 return SDValue();
6492 }
6493 }
6494
6495 bool Signed;
6496 // Verify that our operands are demotable
6497 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
6498 return SDValue();
6499 }
6500
// MulType is i32 or i64 here, so the demoted type is i16 or i32.
6501 EVT DemotedVT;
6502 if (MulType == MVT::i32) {
6503 DemotedVT = MVT::i16;
6504 } else {
6505 DemotedVT = MVT::i32;
6506 }
6507
6508 // Truncate the operands to the correct size. Note that these are just for
6509 // type consistency and will (likely) be eliminated in later phases.
6510 SDValue TruncLHS =
6511 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
6512 SDValue TruncRHS =
6513 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
6514
// Pick the signed or unsigned widening multiply to match the operands.
6515 unsigned Opc;
6516 if (Signed) {
6517 Opc = NVPTXISD::MUL_WIDE_SIGNED;
6518 } else {
6519 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
6520 }
6521
6522 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
6523}
6524
6525static bool isConstOne(const SDValue &Operand) {
6526 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
6527 return Const && Const->getZExtValue() == 1;
6528}
6529
// Given an ADD with a constant-one operand, returns the other operand;
// returns an empty SDValue for any other node.
// NOTE(review): embedded listing line 6530 is missing from this view; it
// presumably holds the function signature taking `Add` -- confirm against
// upstream.
6531 if (Add->getOpcode() != ISD::ADD)
6532 return SDValue();
6533
// The constant one may sit on either side of the add.
6534 if (isConstOne(Add->getOperand(0)))
6535 return Add->getOperand(1);
6536
6537 if (isConstOne(Add->getOperand(1)))
6538 return Add->getOperand(0);
6539
6540 return SDValue();
6541}
6542
6545
// Builds (add (mul X, Y), X) and returns it; falls through to an empty
// SDValue otherwise.
// NOTE(review): embedded listing lines 6543-6544 and 6546 are missing from
// this view. They presumably hold the signature and the `if` that binds Y
// (closed by the brace below) -- confirm against upstream.
6547 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6548 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
6549 }
6550
6551 return SDValue();
6552}
6553
// Pushes a multiplication by X into a SELECT that has a constant-one arm:
// the constant-one arm becomes X and the other arm Y becomes (mul X, Y).
// Returns an empty SDValue when the node is not such a SELECT.
// NOTE(review): embedded listing lines 6554, 6556 and 6573 are missing from
// this view. 6554/6556 presumably hold the function name and remaining
// parameters; 6573 presumably holds the profitability condition guarding the
// `return SDValue();` below -- confirm against upstream.
6555 SDLoc DL,
6557 if (Select->getOpcode() != ISD::SELECT)
6558 return SDValue();
6559
6560 SDValue Cond = Select->getOperand(0);
6561
// Record which SELECT operand (1 = true arm, 2 = false arm) is the constant
// one.
6562 unsigned ConstOpNo;
6563 if (isConstOne(Select->getOperand(1)))
6564 ConstOpNo = 1;
6565 else if (isConstOne(Select->getOperand(2)))
6566 ConstOpNo = 2;
6567 else
6568 return SDValue();
6569
// Y is the arm that is not the constant one.
6570 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
6571
6572 // Do not combine if the resulting sequence is not obviously profitable.
6574 return SDValue();
6575
6576 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6577
// Keep each value on the arm it came from: X replaces the constant one.
6578 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
6579 (ConstOpNo == 1) ? X : NewMul,
6580 (ConstOpNo == 1) ? NewMul : X);
6581}
6582
// Tries the multiply-by-(y + 1) and multiply-by-select-with-one rewrites on a
// scalar i16/i32/i64 multiply, each with both operand orders. Returns an
// empty SDValue when none applies.
// NOTE(review): embedded listing lines 6584-6585 are missing from this view;
// they presumably hold the function name and the N/N0/N1/DCI parameters --
// confirm against upstream.
6583static SDValue
6586
6587 EVT VT = N0.getValueType();
6588 if (VT.isVector())
6589 return SDValue();
6590
6591 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
6592 return SDValue();
6593
6594 SDLoc DL(N);
6595
6596 // (mul x, (add y, 1)) -> (add (mul x, y), x)
6597 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
6598 return Res;
6599 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
6600 return Res;
6601
6602 // (mul x, (select y, 1)) -> (select (mul x, y), x)
6603 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
6604 return Res;
6605 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
6606 return Res;
6607
6608 return SDValue();
6609}
6610
6611/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
// Does nothing at CodeGenOptLevel::None. Otherwise tries the mul.wide rewrite
// first and then the operand-based MUL combines.
// NOTE(review): embedded listing lines 6612-6613 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
6614 CodeGenOptLevel OptLevel) {
6615 if (OptLevel == CodeGenOptLevel::None)
6616 return SDValue();
6617
6618 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6619 return Ret;
6620
6621 SDValue N0 = N->getOperand(0);
6622 SDValue N1 = N->getOperand(1);
6623 return PerformMULCombineWithOperands(N, N0, N1, DCI);
6624}
6625
6626/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
// The only pattern tried is the mul.wide rewrite; an empty SDValue is
// returned when it does not apply or when optimization is disabled.
// NOTE(review): embedded listing lines 6627-6628 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
6629 CodeGenOptLevel OptLevel) {
6630 if (OptLevel > CodeGenOptLevel::None) {
6631 // Try mul.wide combining at OptLevel > 0
6632 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6633 return Ret;
6634 }
6635
6636 return SDValue();
6637}
6638
// Turns a v2f16/v2bf16 comparison producing v2i1 into a single packed setp
// node with two i1 results, then rebuilds the v2i1 value from them. Returns
// an empty SDValue for other types, or for v2bf16 when SmVersion < 90.
// NOTE(review): embedded listing lines 6639-6640 and 6660 are missing from
// this view. 6639-6640 presumably hold the function name and the N/DCI
// parameters; 6660 presumably holds the opcode chosen for the non-v2f16
// case -- confirm against upstream.
6641 unsigned int SmVersion) {
6642 EVT CCType = N->getValueType(0);
6643 SDValue A = N->getOperand(0);
6644 SDValue B = N->getOperand(1);
6645
6646 EVT AType = A.getValueType();
6647 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
6648 return SDValue();
6649
6650 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
6651 return SDValue();
6652
6653 SDLoc DL(N);
6654 // setp.f16x2 returns two scalar predicates, which we need to
6655 // convert back to v2i1. The returned result will be scalarized by
6656 // the legalizer, but the comparison will remain a single vector
6657 // instruction.
// Operand 2 of N (the condition code) is forwarded unchanged.
6658 SDValue CCNode = DCI.DAG.getNode(
6659 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
6661 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
6662 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
6663 CCNode.getValue(1));
6664}
6665
// Rewrites an element extract with a non-zero constant index from a 16-, 32-
// or 64-bit vector as: bitcast the vector to an integer, shift right by
// Index * EltBits, truncate to the element width. Returns an empty SDValue
// for every case that is excluded below.
// NOTE(review): embedded listing lines 6666-6667 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
6668 SDValue Vector = peekThroughFreeze(N->getOperand(0));
6669 SDLoc DL(N);
6670 EVT VectorVT = Vector.getValueType();
6671 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6672 IsPTXVectorType(VectorVT.getSimpleVT()))
6673 return SDValue(); // Native vector loads already combine nicely w/
6674 // extract_vector_elt.
6675 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6676 // we already handle them OK.
6677 if (VectorVT.getVectorNumElements() == 1 ||
6678 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6679 return SDValue();
6680
6681 // Don't mess with undef values as sra may be simplified to 0, not undef.
6682 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6683 return SDValue();
6684
6685 uint64_t VectorBits = VectorVT.getSizeInBits();
6686 // We only handle the types we can extract in-register.
6687 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6688 return SDValue();
6689
6690 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6691 // Index == 0 is handled by generic DAG combiner.
6692 if (!Index || Index->getZExtValue() == 0)
6693 return SDValue();
6694
// IVT is an integer as wide as the whole vector; EltIVT is an integer as wide
// as one element.
6695 MVT IVT = MVT::getIntegerVT(VectorBits);
6696 EVT EltVT = VectorVT.getVectorElementType();
6697 EVT EltIVT = EltVT.changeTypeToInteger();
6698 uint64_t EltBits = EltVT.getScalarSizeInBits();
6699
// Shift the requested element down to bit 0, then truncate to element width.
6700 SDValue Result = DCI.DAG.getNode(
6701 ISD::TRUNCATE, DL, EltIVT,
6702 DCI.DAG.getNode(
6703 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6704 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6705
6706 // If element has non-integer type, bitcast it back to the expected type.
6707 if (EltVT != EltIVT)
6708 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6709 // Past legalizer, we may need to extend i8 -> i16 to match the register type.
6710 if (EltVT != N->getValueType(0))
6711 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6712
6713 return Result;
6714}
6715
6716/// Transform patterns like:
6717/// (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt))
6718/// (select (ult shift_amt, BitWidth), (srl/shl x, shift_amt), 0)
6719/// Into:
6720/// (NVPTXISD::SRL_CLAMP x, shift_amt) or (NVPTXISD::SHL_CLAMP x, shift_amt)
6721///
6722/// These patterns arise from C/C++ code like `shift >= 32 ? 0 : x >> shift`
6723/// which guards against undefined behavior. PTX shr/shl instructions clamp
6724/// shift amounts >= BitWidth to produce 0 for logical shifts, making the
6725/// guard redundant.
6726///
6727/// Note: We only handle SRL and SHL, not SRA, because arithmetic right
6728/// shifts could produce 0 or -1 when shift >= BitWidth.
6729/// Note: We don't handle uge or ule. These don't appear because of
6730/// canonicalization.
// Runs only after DAG legalization; returns an empty SDValue otherwise or
// when neither pattern matches.
// NOTE(review): embedded listing lines 6731-6732, 6750 and 6757 are missing
// from this view. 6731-6732 presumably hold the function signature; 6750 and
// 6757 presumably hold the constant each setcc compares against -- confirm
// against upstream.
6733 if (!DCI.isAfterLegalizeDAG())
6734 return SDValue();
6735
6736 using namespace SDPatternMatch;
6737 unsigned BitWidth = N->getValueType(0).getSizeInBits();
6738 SDValue ShiftAmt, ShiftOp;
6739
6740 // Match logical shifts where the shift amount in the guard matches the shift
6741 // amount in the operation.
// m_Deferred(ShiftAmt) refers to the value bound by m_Value(ShiftAmt) in the
// setcc patterns below; the shift may use it directly or through a truncate.
6742 auto LogicalShift =
6743 m_AllOf(m_Value(ShiftOp),
6744 m_AnyOf(m_Srl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))),
6745 m_Shl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt)))));
6746
6747 // shift_amt > BitWidth-1 ? 0 : shift_op
6748 bool MatchedUGT =
6749 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
6751 m_SpecificCondCode(ISD::SETUGT)),
6752 m_Zero(), LogicalShift));
6753 // shift_amt < BitWidth ? shift_op : 0
6754 bool MatchedULT =
6755 !MatchedUGT &&
6756 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
6758 m_SpecificCondCode(ISD::SETULT)),
6759 LogicalShift, m_Zero()));
6760
6761 if (!MatchedUGT && !MatchedULT)
6762 return SDValue();
6763
6764 // Return a clamp shift operation, which has the same semantics as PTX shift.
// The clamp node reuses the matched shift's own operands and result type.
6765 unsigned ClampOpc = ShiftOp.getOpcode() == ISD::SRL ? NVPTXISD::SRL_CLAMP
6766 : NVPTXISD::SHL_CLAMP;
6767 return DCI.DAG.getNode(ClampOpc, SDLoc(N), ShiftOp.getValueType(),
6768 ShiftOp.getOperand(0), ShiftOp.getOperand(1));
6769}
6770
// Splits a v4i8 VSELECT into four scalar SELECTs performed in i32 and
// rebuilds the v4i8 result. Returns an empty SDValue for any other type.
// NOTE(review): embedded listing lines 6771-6772 and 6782 are missing from
// this view. 6771-6772 presumably hold the function signature; 6782
// presumably declares the element list `E` filled below -- confirm against
// upstream.
6773 SDValue VA = N->getOperand(1);
6774 EVT VectorVT = VA.getValueType();
6775 if (VectorVT != MVT::v4i8)
6776 return SDValue();
6777
6778 // We need to split vselect into individual per-element operations. Because we
6779 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
6780 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
6781 // to/from i16 normally used for i8 values.
6783 SDLoc DL(N);
6784 SDValue VCond = N->getOperand(0);
6785 SDValue VB = N->getOperand(2);
6786 for (int I = 0; I < 4; ++I) {
// Per lane: extract the i1 condition and both i8 values, widen the values to
// i32, select, then narrow the result back to i8.
6787 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6788 DCI.DAG.getConstant(I, DL, MVT::i32));
6789 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6790 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6791 DCI.DAG.getConstant(I, DL, MVT::i32)),
6792 DL, MVT::i32);
6793 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6794 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6795 DCI.DAG.getConstant(I, DL, MVT::i32)),
6796 DL, MVT::i32);
6797 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6798 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6799 }
6800 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6801}
6802
// Folds a two-element packed 32-bit BUILD_VECTOR whose operands are
// single-use i16 truncates of i32 values (each optionally shifted right by
// 16) into one PRMT of the original i32 values. Runs only after DAG
// legalization; returns an empty SDValue when the pattern does not match.
// NOTE(review): embedded listing line 6804 is missing from this view; it
// presumably holds the function name and the N/DCI parameters -- confirm
// against upstream.
6803static SDValue
6805 auto VT = N->getValueType(0);
6806 if (!DCI.isAfterLegalizeDAG() ||
6807 // only process v2*16 types
6808 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6809 VT.getVectorNumElements() == 2))
6810 return SDValue();
6811
6812 auto Op0 = N->getOperand(0);
6813 auto Op1 = N->getOperand(1);
6814
6815 // Start out by assuming we want to take the lower 2 bytes of each i32
6816 // operand.
6817 uint64_t Op0Bytes = 0x10;
6818 uint64_t Op1Bytes = 0x54;
6819
// Pair each operand with its byte selector so the loop below can rewrite
// both in place through the pointers.
6820 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6821 {&Op1, &Op1Bytes}};
6822
6823 // Check that each operand is an i16, truncated from an i32 operand. We'll
6824 // select individual bytes from those original operands. Optionally, fold in a
6825 // shift right of that original operand.
6826 for (auto &[Op, OpBytes] : OpData) {
6827 // Eat up any bitcast
6828 if (Op->getOpcode() == ISD::BITCAST)
6829 *Op = Op->getOperand(0);
6830
6831 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6832 Op->getOperand(0).getValueType() == MVT::i32))
6833 return SDValue();
6834
6835 // If the truncate has multiple uses, this optimization can increase
6836 // register pressure
6837 if (!Op->hasOneUse())
6838 return SDValue();
6839
// Step through the truncate to the underlying i32 value.
6840 *Op = Op->getOperand(0);
6841
6842 // Optionally, fold in a shift-right of the original operand and let permute
6843 // pick the two higher bytes of the original value directly.
6844 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6845 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6846 // Shift the PRMT byte selector to pick upper bytes from each respective
6847 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6848 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6849 "PRMT selector values out of range");
6850 *OpBytes += 0x22;
6851 *Op = Op->getOperand(0);
6852 }
6853 }
6854 }
6855
6856 SDLoc DL(N);
6857 auto &DAG = DCI.DAG;
6858
// Op0's selector occupies the low byte of the combined selector and Op1's the
// next byte.
6859 auto PRMT =
6860 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6861 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6862 return DAG.getBitcast(VT, PRMT);
6863}
6864
// Removes a pair of address-space casts that round-trip back to the original
// address space. Returns an empty SDValue when the operand is not another
// address-space cast or the pair does not round-trip.
// NOTE(review): embedded listing lines 6865-6866 are missing from this view;
// they presumably hold the function signature -- confirm against upstream.
6867 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6868
6869 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
// The inner cast's destination must be the outer cast's source.
6870 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6871
6872 // Fold asc[B -> A](asc[A -> B](x)) -> x
6873 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6874 return ASCN2->getOperand(0);
6875 }
6876
6877 return SDValue();
6878}
6879
6880// Given a constant selector value and a prmt mode, return the selector value
6881// normalized to the generic prmt mode. See the PTX ISA documentation for more
6882// details:
6883// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
//
// NOTE(review): embedded listing lines 6887, 6898, 6900, 6902, 6904, 6906
// and 6908 are missing from this view. 6887 presumably holds the condition
// under which the selector is returned unchanged, and the others the `case`
// labels for each mode, so which return belongs to which mode cannot be
// told from here -- confirm against upstream.
6884static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6885 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6886
6888 return Selector;
6889
// Only the low two bits of the selector are used from here on.
6890 const unsigned V = Selector.trunc(2).getZExtValue();
6891
// Packs four selector fields, one per result byte, at 4-bit strides with S0
// in the lowest field.
6892 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6893 unsigned S3) {
6894 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6895 };
6896
6897 switch (Mode) {
6899 return GetSelector(V, V + 1, V + 2, V + 3);
6901 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6903 return GetSelector(V, V, V, V);
6905 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6907 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6909 unsigned V1 = (V & 1) << 1;
6910 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6911 }
6912 default:
6913 llvm_unreachable("Invalid PRMT mode");
6914 }
6915}
6916
6917static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6918 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6919 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6920 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6921 APInt BitField = B.concat(A);
6922 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6923 APInt Result(32, 0);
6924 for (unsigned I : llvm::seq(4U)) {
6925 APInt Sel = SelectorVal.extractBits(4, I * 4);
6926 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6927 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6928 APInt Byte = BitField.extractBits(8, Idx * 8);
6929 if (Sign)
6930 Byte = Byte.ashr(8);
6931 Result.insertBits(Byte, I * 8);
6932 }
6933 return Result;
6934}
6935
// Constant-folds a PRMT node whose first three operands are all constants,
// using operand 3 as the mode. Does nothing at CodeGenOptLevel::None.
// NOTE(review): embedded listing line 6936 is missing from this view; it
// presumably holds the function name and the N/DCI parameters -- confirm
// against upstream.
6937 CodeGenOptLevel OptLevel) {
6938 if (OptLevel == CodeGenOptLevel::None)
6939 return SDValue();
6940
6941 // Constant fold PRMT
6942 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6943 isa<ConstantSDNode>(N->getOperand(1)) &&
6944 isa<ConstantSDNode>(N->getOperand(2)))
6945 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6946 N->getConstantOperandAPInt(1),
6947 N->getConstantOperandAPInt(2),
6948 N->getConstantOperandVal(3)),
6949 SDLoc(N), N->getValueType(0));
6950 return SDValue();
6951}
6952
6953// During call lowering we wrap the return values in a ProxyReg node which
6954// depend on the chain value produced by the completed call. This ensures that
6955// the full call is emitted in cases where libcalls are used to legalize
6956// operations. To improve the functioning of other DAG combines we pull all
6957// operations we can through one of these nodes, ensuring that the ProxyReg
6958// directly wraps a load. That is:
6959//
6960// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6961//
// Recurses through the expression rooted at R and rebuilds it with a ProxyReg
// wrapped around each load leaf. Returns an empty SDValue as soon as any part
// of the expression cannot be handled.
// NOTE(review): embedded listing lines 6962-6963, 6995, 7004 and 7009 are
// missing from this view. 6962-6963 presumably hold the signature taking
// R/Chain/DCI; 6995 presumably declares `Ops`; 7004 presumably holds the case
// label for the final group; 7009 presumably holds the start of its
// getNode call -- confirm against upstream.
6964 switch (R.getOpcode()) {
// Single-operand conversions: rebuild the same node on top of the sunk
// operand.
6965 case ISD::TRUNCATE:
6966 case ISD::ANY_EXTEND:
6967 case ISD::SIGN_EXTEND:
6968 case ISD::ZERO_EXTEND:
6969 case ISD::BITCAST: {
6970 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6971 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6972 return SDValue();
6973 }
// Two-operand nodes: both operands must be sinkable.
6974 case ISD::SHL:
6975 case ISD::SRL:
6976 case ISD::SRA:
6977 case ISD::OR: {
6978 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6979 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6980 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6981 return SDValue();
6982 }
// Constants are returned as-is, without a ProxyReg.
6983 case ISD::Constant:
6984 return R;
// Loads are the leaves: this is where the ProxyReg is placed.
6985 case ISD::LOAD:
6986 case NVPTXISD::LoadV2:
6987 case NVPTXISD::LoadV4: {
6988 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6989 {Chain, R});
6990 }
6991 case ISD::BUILD_VECTOR: {
6992 if (DCI.isBeforeLegalize())
6993 return SDValue();
6994
// Every element must be sinkable, otherwise give up on the whole vector.
6996 for (auto &Op : R->ops()) {
6997 SDValue V = sinkProxyReg(Op, Chain, DCI);
6998 if (!V)
6999 return SDValue();
7000 Ops.push_back(V);
7001 }
7002 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
7003 }
7005 if (DCI.isBeforeLegalize())
7006 return SDValue();
7007
// Only operand 0 is sunk; operand 1 is passed through unchanged.
7008 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
7010 R.getValueType(), V, R.getOperand(1));
7011 return SDValue();
7012 }
7013 default:
7014 return SDValue();
7015 }
7016}
7017
7018static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) {
7019 switch (AddIntrinsicID) {
7020 default:
7021 break;
7022 case Intrinsic::nvvm_add_rn_sat_f16:
7023 case Intrinsic::nvvm_add_rn_sat_v2f16:
7024 return NVPTXISD::SUB_RN_SAT;
7025 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
7026 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
7027 return NVPTXISD::SUB_RN_FTZ_SAT;
7028 }
7029 llvm_unreachable("Invalid F16 add intrinsic");
7030}
7031
// Rewrites an add intrinsic with one FNEG operand as the corresponding
// subtraction: add(-a, b) -> sub(b, a) and add(a, -b) -> sub(a, b). Returns
// an empty SDValue when neither operand is an FNEG.
// NOTE(review): embedded listing line 7032 is missing from this view; it
// presumably holds the function name and the N/DAG parameters -- confirm
// against upstream.
7033 Intrinsic::ID AddIntrinsicID) {
// Operand 0 is skipped here; the caller reads the intrinsic ID from it.
7034 SDValue Op1 = N->getOperand(1);
7035 SDValue Op2 = N->getOperand(2);
7036
7037 SDValue SubOp1, SubOp2;
7038
// If both operands are FNEGs, only the first one is folded.
7039 if (Op1.getOpcode() == ISD::FNEG) {
7040 SubOp1 = Op2;
7041 SubOp2 = Op1.getOperand(0);
7042 } else if (Op2.getOpcode() == ISD::FNEG) {
7043 SubOp1 = Op1;
7044 SubOp2 = Op2.getOperand(0);
7045 } else {
7046 return SDValue();
7047 }
7048
7049 SDLoc DL(N);
7050 return DAG.getNode(getF16SubOpc(AddIntrinsicID), DL, N->getValueType(0),
7051 SubOp1, SubOp2);
7052}
7053
// Dispatches on the intrinsic ID held in operand 0. Only the saturating
// f16/v2f16 add intrinsics are handled here; every other ID returns an empty
// SDValue.
// NOTE(review): embedded listing lines 7054-7055 are missing from this view;
// they presumably hold the function name and the N/DCI parameters -- confirm
// against upstream.
7056 const NVPTXSubtarget &STI) {
7057 unsigned IID = N->getConstantOperandVal(0);
7058
7059 switch (IID) {
7060 default:
7061 break;
7062 case Intrinsic::nvvm_add_rn_sat_f16:
7063 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
7064 case Intrinsic::nvvm_add_rn_sat_v2f16:
7065 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
7066 return combineF16AddWithNeg(N, DCI.DAG, IID);
7067 }
7068 return SDValue();
7069}
7070
7073
// Combine for ProxyReg nodes: operand 0 is the chain and operand 1 the
// wrapped value. Returns the rewritten expression from sinkProxyReg, or an
// empty SDValue when nothing changes.
// NOTE(review): embedded listing lines 7071-7072 are missing from this view;
// they presumably hold the function signature -- confirm against upstream.
7074 SDValue Chain = N->getOperand(0);
7075 SDValue Reg = N->getOperand(1);
7076
7077 // If the ProxyReg is not wrapping a load, try to pull the operations through
7078 // the ProxyReg.
7079 if (Reg.getOpcode() != ISD::LOAD) {
7080 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
7081 return V;
7082 }
7083
7084 return SDValue();
7085}
7086
// Entry point for the NVPTX-specific DAG combines: dispatches on the node's
// opcode to the matching helper and returns an empty SDValue for opcodes
// without one.
// NOTE(review): embedded listing lines 7089, 7102 and 7143 are missing from
// this view. 7089 presumably defines OptLevel; 7102 and 7143 presumably hold
// the case labels for the PerformEXTRACTCombine and combineIntrinsicWOChain
// returns -- confirm against upstream.
7087SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
7088 DAGCombinerInfo &DCI) const {
7090 switch (N->getOpcode()) {
7091 default:
7092 break;
7093 case ISD::ADD:
7094 return PerformADDCombine(N, DCI, OptLevel);
7095 case ISD::ADDRSPACECAST:
7096 return combineADDRSPACECAST(N, DCI);
7097 case ISD::SIGN_EXTEND:
7098 case ISD::ZERO_EXTEND:
7099 return combineSZExtToMulWide(N, DCI, OptLevel);
7100 case ISD::BUILD_VECTOR:
7101 return PerformBUILD_VECTORCombine(N, DCI);
7103 return PerformEXTRACTCombine(N, DCI);
7104 case ISD::FADD:
7105 return performFADDCombine(N, DCI, OptLevel);
// FMA/FMUL/FSUB only get the v2f32 scalarization; FADD also runs it inside
// performFADDCombine.
7106 case ISD::FMA:
7107 case ISD::FMUL:
7108 case ISD::FSUB:
7109 return performScalarizeV2F32Op(N, DCI, OptLevel);
7110 case ISD::FMAXNUM:
7111 case ISD::FMINNUM:
7112 case ISD::FMAXIMUM:
7113 case ISD::FMINIMUM:
7114 case ISD::FMAXIMUMNUM:
7115 case ISD::FMINIMUMNUM:
7116 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
7117 STI.getSmVersion());
7118 case ISD::LOAD:
7119 case NVPTXISD::LoadV2:
7120 case NVPTXISD::LoadV4:
7121 return combineLOAD(N, DCI, STI);
7122 case ISD::MUL:
7123 return PerformMULCombine(N, DCI, OptLevel);
7124 case NVPTXISD::PRMT:
7125 return combinePRMT(N, DCI, OptLevel);
7126 case NVPTXISD::ProxyReg:
7127 return combineProxyReg(N, DCI);
7128 case ISD::SETCC:
7129 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
7130 case ISD::SHL:
7131 return PerformSHLCombine(N, DCI, OptLevel);
7132 case ISD::SREM:
7133 case ISD::UREM:
7134 return PerformREMCombine(N, DCI, OptLevel);
7135 case ISD::STORE:
7136 case NVPTXISD::StoreV2:
7137 case NVPTXISD::StoreV4:
7138 return combineSTORE(N, DCI, STI);
7139 case ISD::SELECT:
7140 return PerformSELECTShiftCombine(N, DCI);
7141 case ISD::VSELECT:
7142 return PerformVSELECTCombine(N, DCI);
7144 return combineIntrinsicWOChain(N, DCI, STI);
7145 }
7146 return SDValue();
7147}
7148
// Custom result replacement for a bitcast producing v2i8: the source is
// bitcast to i16, its low and high bytes are split out, and the resulting
// BUILD_VECTOR is appended to Results. Any other result type is left alone
// (nothing is appended).
// NOTE(review): embedded listing lines 7149-7150 are missing from this view;
// they presumably hold the signature taking Node/DAG/Results -- confirm
// against upstream.
7151 // Handle bitcasting to v2i8 without hitting the default promotion
7152 // strategy which goes through stack memory.
7153 SDValue Op(Node, 0);
7154 EVT ToVT = Op->getValueType(0);
7155 if (ToVT != MVT::v2i8) {
7156 return;
7157 }
7158
7159 // Bitcast to i16 and unpack elements into a vector
7160 SDLoc DL(Node);
7161 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
// Element 0 is the low byte; element 1 is the high byte, obtained by shifting
// right by 8 before truncating.
7162 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
7163 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
7164 SDValue Vec1 =
7165 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7166 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
7167 Results.push_back(
7168 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
7169}
7170
7173 SDValue Chain = N->getOperand(0);
7174 SDValue Intrin = N->getOperand(1);
7175 SDLoc DL(N);
7176
7177 // Get the intrinsic ID
7178 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
7179 switch (IntrinNo) {
7180 default:
7181 return;
7182 case Intrinsic::nvvm_ldu_global_i:
7183 case Intrinsic::nvvm_ldu_global_f:
7184 case Intrinsic::nvvm_ldu_global_p: {
7185 EVT ResVT = N->getValueType(0);
7186
7187 if (ResVT.isVector()) {
7188 // Vector LDG/LDU
7189
7190 unsigned NumElts = ResVT.getVectorNumElements();
7191 EVT EltVT = ResVT.getVectorElementType();
7192
7193 // Since LDU/LDG are target nodes, we cannot rely on DAG type
7194 // legalization.
7195 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
7196 // loaded type to i16 and propagate the "real" type as the memory type.
7197 bool NeedTrunc = false;
7198 if (EltVT.getSizeInBits() < 16) {
7199 EltVT = MVT::i16;
7200 NeedTrunc = true;
7201 }
7202
7203 unsigned Opcode = 0;
7204 SDVTList LdResVTs;
7205
7206 switch (NumElts) {
7207 default:
7208 return;
7209 case 2:
7210 Opcode = NVPTXISD::LDUV2;
7211 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
7212 break;
7213 case 4: {
7214 Opcode = NVPTXISD::LDUV4;
7215 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
7216 LdResVTs = DAG.getVTList(ListVTs);
7217 break;
7218 }
7219 }
7220
7221 SmallVector<SDValue, 8> OtherOps;
7222
7223 // Copy regular operands
7224
7225 OtherOps.push_back(Chain); // Chain
7226 // Skip operand 1 (intrinsic ID)
7227 // Others
7228 OtherOps.append(N->op_begin() + 2, N->op_end());
7229
7231
7232 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
7233 MemSD->getMemoryVT(),
7234 MemSD->getMemOperand());
7235
7236 SmallVector<SDValue, 4> ScalarRes;
7237
7238 for (unsigned i = 0; i < NumElts; ++i) {
7239 SDValue Res = NewLD.getValue(i);
7240 if (NeedTrunc)
7241 Res =
7242 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
7243 ScalarRes.push_back(Res);
7244 }
7245
7246 SDValue LoadChain = NewLD.getValue(NumElts);
7247
7248 SDValue BuildVec =
7249 DAG.getBuildVector(ResVT, DL, ScalarRes);
7250
7251 Results.push_back(BuildVec);
7252 Results.push_back(LoadChain);
7253 } else {
7254 // i8 LDG/LDU
7255 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
7256 "Custom handling of non-i8 ldu/ldg?");
7257
7258 // Just copy all operands as-is
7260
7261 // Force output to i16
7262 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
7263
7265
7266 // We make sure the memory type is i8, which will be used during isel
7267 // to select the proper instruction.
7268 SDValue NewLD =
7270 MVT::i8, MemSD->getMemOperand());
7271
7272 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7273 NewLD.getValue(0)));
7274 Results.push_back(NewLD.getValue(1));
7275 }
7276 return;
7277 }
7278
7279 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
7280 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
7281 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
7282 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
7283 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
7284 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
7285 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
7286 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
7287 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
7288 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
7289 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
7290 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
7291 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
7292 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
7293 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
7294 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
7295 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
7296 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
7297 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
7298 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
7299 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
7300 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
7301 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
7302 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
7303 if (auto Res = lowerTcgen05Ld(N, DAG)) {
7304 Results.push_back(Res->first);
7305 Results.push_back(Res->second);
7306 }
7307 return;
7308
7309 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
7310 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
7311 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
7312 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
7313 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
7314 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
7315 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
7316 Results.push_back(Res->first);
7317 Results.push_back(Res->second);
7318 }
7319 return;
7320
7321 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
7322 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
7323 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
7324 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
7325 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
7326 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
7327 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
7328 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
7329 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
7330 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
7331 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
7332 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
7333 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32:
7334 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32:
7335 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32:
7336 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32:
7337 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32:
7338 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32:
7339 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32:
7340 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32:
7341 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32:
7342 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32:
7343 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32:
7344 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32:
7345 if (auto Res = lowerTcgen05LdRed(N, DAG)) {
7346 Results.push_back(std::get<0>(*Res));
7347 Results.push_back(std::get<1>(*Res));
7348 Results.push_back(std::get<2>(*Res));
7349 }
7350 return;
7351 }
7352}
7353
7356 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
7357 // result so that it can pass the legalization
7358 SDLoc DL(N);
7359 SDValue Chain = N->getOperand(0);
7360 SDValue Reg = N->getOperand(1);
7361 SDValue Glue = N->getOperand(2);
7362
7363 assert(Reg.getValueType() == MVT::i128 &&
7364 "Custom lowering for CopyFromReg with 128-bit reg only");
7365 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
7366 N->getValueType(2)};
7367 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
7368
7369 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
7370 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
7371 {NewValue.getValue(0), NewValue.getValue(1)});
7372
7373 Results.push_back(Pair);
7374 Results.push_back(NewValue.getValue(2));
7375 Results.push_back(NewValue.getValue(3));
7376}
7377
7379 const TargetLowering &TLI,
7381 SDValue Chain = N->getOperand(0);
7382 SDValue Reg = N->getOperand(1);
7383
7384 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
7385
7386 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
7387 SDValue NewProxy =
7388 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
7389 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
7390
7391 Results.push_back(Res);
7392}
7393
7395 const NVPTXSubtarget &STI,
7397 assert(N->getValueType(0) == MVT::i128 &&
7398 "Custom lowering for atomic128 only supports i128");
7399
7401 SDLoc dl(N);
7402
7403 if (!STI.hasAtomSwap128()) {
7406 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
7407 "requires target sm_90.",
7408 dl.getDebugLoc()));
7409
7410 Results.push_back(DAG.getUNDEF(MVT::i128));
7411 Results.push_back(AN->getOperand(0)); // Chain
7412 return;
7413 }
7414
7416 Ops.push_back(AN->getOperand(0)); // Chain
7417 Ops.push_back(AN->getOperand(1)); // Ptr
7418 for (const auto &Op : AN->ops().drop_front(2)) {
7419 // Low part
7420 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7421 DAG.getIntPtrConstant(0, dl)));
7422 // High part
7423 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7424 DAG.getIntPtrConstant(1, dl)));
7425 }
7426 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
7429 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
7430 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
7431 AN->getMemOperand());
7432 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
7433 {Result.getValue(0), Result.getValue(1)}));
7434 Results.push_back(Result.getValue(2));
7435}
7436
7437void NVPTXTargetLowering::ReplaceNodeResults(
7439 switch (N->getOpcode()) {
7440 default:
7441 report_fatal_error("Unhandled custom legalization");
7442 case ISD::BITCAST:
7443 ReplaceBITCAST(N, DAG, Results);
7444 return;
7445 case ISD::LOAD:
7446 case ISD::MLOAD:
7447 replaceLoadVector(N, DAG, Results, STI);
7448 return;
7451 return;
7452 case ISD::CopyFromReg:
7454 return;
7455 case NVPTXISD::ProxyReg:
7456 replaceProxyReg(N, DAG, *this, Results);
7457 return;
7459 case ISD::ATOMIC_SWAP:
7460 replaceAtomicSwap128(N, DAG, STI, Results);
7461 return;
7462 }
7463}
7464
7467 Type *Ty = AI->getValOperand()->getType();
7468
7469 if (AI->isFloatingPointOperation()) {
7471 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
7472 STI.getPTXVersion() >= 63)
7474 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
7475 STI.getPTXVersion() >= 78)
7477 if (Ty->isFloatTy())
7479 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
7481 }
7483 }
7484
7485 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
7486 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
7487
7488 switch (AI->getOperation()) {
7489 default:
7492 if (BitWidth == 128)
7494 [[fallthrough]];
7498 switch (BitWidth) {
7499 case 8:
7500 case 16:
7502 case 32:
7504 case 64:
7505 if (STI.hasAtomBitwise64())
7508 case 128:
7510 default:
7511 llvm_unreachable("unsupported width encountered");
7512 }
7519 switch (BitWidth) {
7520 case 8:
7521 case 16:
7523 case 32:
7525 case 64:
7526 if (STI.hasAtomMinMax64())
7529 case 128:
7531 default:
7532 llvm_unreachable("unsupported width encountered");
7533 }
7536 switch (BitWidth) {
7537 case 32:
7539 case 8:
7540 case 16:
7541 case 64:
7542 case 128:
7544 default:
7545 llvm_unreachable("unsupported width encountered");
7546 }
7547 }
7548
7550}
7551
7553 const Instruction *I) const {
7554 // This function returns true iff the operation is emulated using a CAS-loop,
7555 // or if it has the memory order seq_cst (which is not natively supported in
7556 // the PTX `atom` instruction).
7557 //
7558 // atomicrmw and cmpxchg instructions not efficiently supported by PTX
7559 // are lowered to CAS emulation loops that preserve their memory order,
7560 // syncscope, and volatile semantics. For PTX, it is more efficient to use
7561 // atom.cas.relaxed.sco instructions within the loop, and fences before and
7562 // after the loop to restore order.
7563 //
7564 // Atomic instructions efficiently supported by PTX are lowered to
7565 // `atom.<op>.<sem>.<scope>` instructions with their corresponding memory order
7566 // and scope. Since PTX does not support seq_cst, we emulate it by lowering to
7567 // a fence.sc followed by an atom according to the PTX atomics ABI
7568 // https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
7569 if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I))
7570 return (cast<IntegerType>(CI->getCompareOperand()->getType())
7571 ->getBitWidth() < STI.getMinCmpXchgSizeInBits()) ||
7572 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent;
7573 if (auto *RI = dyn_cast<AtomicRMWInst>(I))
7575 RI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
7576 return false;
7577}
7578
7580 const Instruction *I) const {
7581 // If the operation is emulated by a CAS-loop, we lower the instruction to
7582 // atom.<op>.relaxed, since AtomicExpandPass will insert fences for enforcing
7583 // the correct memory ordering around the CAS loop.
7584 //
7585 // When the operation is not emulated, but the memory order is seq_cst,
7586 // we must lower to "fence.sc.<scope>; atom.<op>.acquire.<scope>;" to conform
7587 // to the PTX atomics ABI.
7588 // https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
7589 // For such cases, emitLeadingFence() will separately insert the leading
7590 // "fence.sc.<scope>;". Here, we only set the memory order to acquire.
7591 //
7592 // Otherwise, the operation is not emulated, and the memory order is not
7593 // seq_cst. In this case, the LLVM memory order is natively supported by the
7594 // PTX `atom` instruction, and we just lower to the corresponding
7595 // `atom.<op>.relaxed|acquire|release|acq_rel`. For such cases, this function
7596 // will NOT be called.
7597 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7598 // I before its memory order was modified.
7599 if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
7600 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
7601 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
7602 STI.getMinCmpXchgSizeInBits())
7604 else if (auto *RI = dyn_cast<AtomicRMWInst>(I);
7605 RI && RI->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
7608
7610}
7611
7613 Instruction *Inst,
7614 AtomicOrdering Ord) const {
7615 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7616 // `Inst` before its memory order was modified. We cannot enforce this with an
7617 // assert, because AtomicExpandPass will have modified the memory order
7618 // between the initial call to shouldInsertFencesForAtomic() and the call to
7619 // this function.
7620 if (!isa<AtomicCmpXchgInst>(Inst) && !isa<AtomicRMWInst>(Inst))
7621 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
7622
7623 // Specialize for cmpxchg and atomicrmw
7624 auto SSID = getAtomicSyncScopeID(Inst);
7625 assert(SSID.has_value() && "Expected an atomic operation");
7626
7627 if (isReleaseOrStronger(Ord))
7628 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
7631 SSID.value());
7632
7633 return nullptr;
7634}
7635
7637 Instruction *Inst,
7638 AtomicOrdering Ord) const {
7639 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7640 // `Inst` before its memory order was modified. See `emitLeadingFence` for why
7641 // this cannot be enforced with an assert. Specialize for cmpxchg and
7642 // atomicrmw
7643 auto *CI = dyn_cast<AtomicCmpXchgInst>(Inst);
7644 auto *RI = dyn_cast<AtomicRMWInst>(Inst);
7645 if (!CI && !RI)
7646 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
7647
7648 auto SSID = getAtomicSyncScopeID(Inst);
7649 assert(SSID.has_value() && "Expected an atomic operation");
7650
7651 bool IsEmulated =
7652 CI ? cast<IntegerType>(CI->getCompareOperand()->getType())
7653 ->getBitWidth() < STI.getMinCmpXchgSizeInBits()
7655
7656 if (isAcquireOrStronger(Ord) && IsEmulated)
7657 return Builder.CreateFence(AtomicOrdering::Acquire, SSID.value());
7658
7659 return nullptr;
7660}
7661
7662// Rather than default to SINT when both UINT and SINT are custom, we only
7663// change the opcode when UINT is not legal and SINT is. UINT is preferred when
7664// both are custom since unsigned CVT instructions can lead to slightly better
7665// SASS code with fewer instructions.
7667 EVT ToVT) const {
7668 if (isOperationLegal(Op, ToVT))
7669 return Op;
7670 switch (Op) {
7671 case ISD::FP_TO_UINT:
7673 return ISD::FP_TO_SINT;
7674 break;
7678 break;
7679 case ISD::VP_FP_TO_UINT:
7680 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
7681 return ISD::VP_FP_TO_SINT;
7682 break;
7683 default:
7684 break;
7685 }
7686 return Op;
7687}
7688
7689// Pin NVPTXTargetObjectFile's vtables to this file.
7691
7696
7698 const SelectionDAG &DAG, unsigned Depth) {
7699 SDValue A = Op.getOperand(0);
7700 SDValue B = Op.getOperand(1);
7701 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
7702 unsigned Mode = Op.getConstantOperandVal(3);
7703
7704 if (!Selector)
7705 return;
7706
7707 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
7708 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
7709
7710 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
7711 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
7712 "PRMT must have i32 operands");
7713 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
7714 KnownBits BitField = BKnown.concat(AKnown);
7715
7716 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
7717 for (unsigned I : llvm::seq(4)) {
7718 APInt Sel = SelectorVal.extractBits(4, I * 4);
7719 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7720 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7721 KnownBits Byte = BitField.extractBits(8, Idx * 8);
7722 if (Sign)
7723 Byte = KnownBits::ashr(Byte, KnownBits::makeConstant(APInt(8, 7)));
7724 Known.insertBits(Byte, I * 8);
7725 }
7726}
7727
7728static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
7730
7731 // We can't do anything without knowing the sign bit.
7732 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
7733 if (ExtType == ISD::SEXTLOAD)
7734 return;
7735
7736 // ExtLoading to vector types is weird and may not work well with known bits.
7737 auto DestVT = LD->getValueType(0);
7738 if (DestVT.isVector())
7739 return;
7740
7741 assert(Known.getBitWidth() == DestVT.getSizeInBits());
7742 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
7743 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
7744}
7745
7747 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
7748 const SelectionDAG &DAG, unsigned Depth) const {
7749 Known.resetAll();
7750
7751 switch (Op.getOpcode()) {
7752 case NVPTXISD::PRMT:
7753 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
7754 break;
7755 case NVPTXISD::LoadV2:
7756 case NVPTXISD::LoadV4:
7757 case NVPTXISD::LoadV8:
7759 break;
7760 default:
7761 break;
7762 }
7763}
7764
7765static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
7766 const APInt &DemandedBits) {
7767 APInt DemandedLHS = APInt(32, 0);
7768 APInt DemandedRHS = APInt(32, 0);
7769
7770 for (unsigned I : llvm::seq(4)) {
7771 if (DemandedBits.extractBits(8, I * 8).isZero())
7772 continue;
7773
7774 APInt Sel = SelectorVal.extractBits(4, I * 4);
7775 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7776 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7777
7778 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
7779 unsigned ByteStart = (Idx % 4) * 8;
7780 if (Sign)
7781 Src.setBit(ByteStart + 7);
7782 else
7783 Src.setBits(ByteStart, ByteStart + 8);
7784 }
7785
7786 return {DemandedLHS, DemandedRHS};
7787}
7788
7789// Replace undef with 0 as this is easier for other optimizations such as
7790// known bits.
7792 if (!Op)
7793 return SDValue();
7794 if (Op.isUndef())
7795 return DAG.getConstant(0, SDLoc(), MVT::i32);
7796 return Op;
7797}
7798
7800 const APInt &DemandedBits,
7801 SelectionDAG &DAG,
7802 const TargetLowering &TLI,
7803 unsigned Depth) {
7804 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
7805 SDValue Op0 = PRMT.getOperand(0);
7806 SDValue Op1 = PRMT.getOperand(1);
7807 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
7808 if (!SelectorConst)
7809 return SDValue();
7810
7811 unsigned Mode = PRMT.getConstantOperandVal(3);
7812 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
7813
7814 // Try to simplify the PRMT to one of the inputs if the used bytes are all
7815 // from the same input in the correct order.
7816 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
7817 const unsigned SelBits = (4 - LeadingBytes) * 4;
7818 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
7819 return Op0;
7820 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
7821 return Op1;
7822
7823 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
7824
7825 // Attempt to avoid multi-use ops if we don't need anything from them.
7826 SDValue DemandedOp0 =
7827 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
7828 SDValue DemandedOp1 =
7829 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
7830
7831 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
7832 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
7833 if ((DemandedOp0 && DemandedOp0 != Op0) ||
7834 (DemandedOp1 && DemandedOp1 != Op1)) {
7835 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
7836 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
7837 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
7838 }
7839
7840 return SDValue();
7841}
7842
7844 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
7845 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
7846 Known.resetAll();
7847
7848 switch (Op.getOpcode()) {
7849 case NVPTXISD::PRMT:
7851 *this, Depth)) {
7852 TLO.CombineTo(Op, Result);
7853 return true;
7854 }
7855 break;
7856 default:
7857 break;
7858 }
7859
7860 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7861 return false;
7862}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
static bool IsIndirectCall(const MachineInstr *MI)
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG, unsigned Val)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG, bool hasOffset=false)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue PerformSELECTShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Transform patterns like: (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt)) (select (ult...
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static unsigned getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static unsigned getTcgen05LdRedID(Intrinsic::ID IID)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx, const DataLayout &DL)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineSZExtToMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static std::optional< unsigned > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isNonCoalescableBuildVector(const SDValue &BV)
Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent register pairs (non-coalescable...
static bool isConstZero(const SDValue &Operand)
static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE)
static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG)
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static std::pair< MemSDNode *, uint32_t > convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static std::optional< std::tuple< SDValue, SDValue, SDValue > > lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG)
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG)
static SDValue combineIntrinsicWOChain(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG, Intrinsic::ID AddIntrinsicID)
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing the numBits low bits from this APInt.
Definition APInt.cpp:645
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing the numBits high bits from this APInt.
Definition APInt.cpp:640
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:436
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:433
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
Module * getParent()
Get the module that this global value is contained inside of...
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
static constexpr unsigned NoRegister
Definition MCRegister.h:60
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:573
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:68
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const
bool hasUsedBytesMaskPragma() const
bool hasTensormapReplaceElemtypeSupport(unsigned value) const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
constexpr const char * data() const
Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:138
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3207
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ POISON
POISON - A poison node.
Definition ISDOpcodes.h:236
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:815
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:386
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:809
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ VECREDUCE_FMINIMUM
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:837
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ ABS_MIN_POISON
ABS with a poison result for INT_MIN.
Definition ISDOpcodes.h:751
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ DeviceParam
Definition NVPTX.h:215
@ EntryParam
Definition NVPTX.h:209
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:278
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
SDValue peekThroughFreeze(SDValue V)
Return the non-frozen source operand of V if it exists.
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
std::optional< SyncScope::ID > getAtomicSyncScopeID(const Instruction *I)
A helper function that returns an atomic operation's sync scope; returns std::nullopt if it is not an...
unsigned promoteScalarArgumentSize(unsigned size)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool shouldPassAsArray(Type *Ty)
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os, -Oz
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL)
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isParamGridConstant(const Argument &Arg)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL)
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL)
Since function arguments are passed via .param space, we may want to increase their alignment in a wa...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:323
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:220
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:247
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:233
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...