NVPTXISelLowering.cpp
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
19#include "NVPTXSubtarget.h"
20#include "NVPTXTargetMachine.h"
22#include "NVPTXUtilities.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/StringRef.h"
40#include "llvm/IR/Argument.h"
41#include "llvm/IR/Attributes.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/DataLayout.h"
46#include "llvm/IR/FPEnv.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalValue.h"
49#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Instruction.h"
52#include "llvm/IR/IntrinsicsNVPTX.h"
53#include "llvm/IR/Module.h"
54#include "llvm/IR/Type.h"
55#include "llvm/IR/Value.h"
67#include <algorithm>
68#include <cassert>
69#include <cmath>
70#include <cstdint>
71#include <iterator>
72#include <optional>
73#include <string>
74#include <tuple>
75#include <utility>
76#include <vector>
77
78#define DEBUG_TYPE "nvptx-lower"
79
80using namespace llvm;
81
83 "nvptx-sched4reg",
84 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
85
87 "nvptx-fma-level", cl::Hidden,
88 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
89 " 1: do it 2: do it aggressively"),
90 cl::init(2));
91
93 "nvptx-prec-divf32", cl::Hidden,
95 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
97 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
98 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
100 "Use IEEE Compliant F32 div.rnd if available (default)"),
102 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
104
106 "nvptx-prec-sqrtf32", cl::Hidden,
107 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
108 cl::init(true));
109
110/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
111/// does NOT use lg2.approx for log2, so this is disabled by default.
113 "nvptx-approx-log2f32",
114 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
115 cl::init(false));
116
118 "nvptx-force-min-byval-param-align", cl::Hidden,
119 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
120 " params of device functions."),
121 cl::init(false));
122
125 const SDNode &N) const {
126 // If nvptx-prec-divf32=N is used on the command-line, always honor it
127 if (UsePrecDivF32.getNumOccurrences() > 0)
128 return UsePrecDivF32;
129
130 const SDNodeFlags Flags = N.getFlags();
131 if (Flags.hasApproximateFuncs())
133
135}
136
138 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
139 if (UsePrecSqrtF32.getNumOccurrences() > 0)
140 return UsePrecSqrtF32;
141
142 if (N) {
143 const SDNodeFlags Flags = N->getFlags();
144 if (Flags.hasApproximateFuncs())
145 return false;
146 }
147
148 return true;
149}
150
155
156static bool IsPTXVectorType(MVT VT) {
157 switch (VT.SimpleTy) {
158 default:
159 return false;
160 case MVT::v2i1:
161 case MVT::v4i1:
162 case MVT::v2i8:
163 case MVT::v4i8:
164 case MVT::v8i8: // <2 x i8x4>
165 case MVT::v16i8: // <4 x i8x4>
166 case MVT::v2i16:
167 case MVT::v4i16:
168 case MVT::v8i16: // <4 x i16x2>
169 case MVT::v2i32:
170 case MVT::v4i32:
171 case MVT::v2i64:
172 case MVT::v2f16:
173 case MVT::v4f16:
174 case MVT::v8f16: // <4 x f16x2>
175 case MVT::v2bf16:
176 case MVT::v4bf16:
177 case MVT::v8bf16: // <4 x bf16x2>
178 case MVT::v2f32:
179 case MVT::v4f32:
180 case MVT::v2f64:
181 case MVT::v4i64:
182 case MVT::v4f64:
183 case MVT::v8i32:
184 case MVT::v8f32:
185 case MVT::v16f16: // <8 x f16x2>
186 case MVT::v16bf16: // <8 x bf16x2>
187 case MVT::v16i16: // <8 x i16x2>
188 case MVT::v32i8: // <8 x i8x4>
189 return true;
190 }
191}
192
193// When legalizing vector loads/stores, this function is called, which does two
194// things:
195// 1. Determines whether the vector is something we want to custom lower;
196// std::nullopt is returned if we do not want to custom lower it.
197// 2. If we do want to handle it, returns two values:
198// - unsigned int NumElts - the number of elements in the final vector
199// - MVT EltVT - the type of the elements in the final vector
200static std::optional<std::pair<unsigned int, MVT>>
202 unsigned AddressSpace) {
203 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
204
205 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
206 VectorEVT.getSizeInBits() == 256)
207 return {{4, MVT::i64}};
208
209 if (!VectorEVT.isSimple())
210 return std::nullopt;
211 const MVT VectorVT = VectorEVT.getSimpleVT();
212
213 if (!VectorVT.isVector()) {
214 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
215 return {{2, MVT::i64}};
216 return std::nullopt;
217 }
218
219 const MVT EltVT = VectorVT.getVectorElementType();
220 const unsigned NumElts = VectorVT.getVectorNumElements();
221
222 // The size of the PTX virtual register that holds a packed type.
223 unsigned PackRegSize;
224
225 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
226 // legal. We can (and should) split that into 2 stores of <2 x double> here
227 // but I'm leaving that as a TODO for now.
228 switch (VectorVT.SimpleTy) {
229 default:
230 return std::nullopt;
231
232 case MVT::v4i64:
233 case MVT::v4f64:
234 // This is a "native" vector type iff the address space is global and the
235 // target supports 256-bit loads/stores
236 if (!CanLowerTo256Bit)
237 return std::nullopt;
238 [[fallthrough]];
239 case MVT::v2i8:
240 case MVT::v2i64:
241 case MVT::v2f64:
242 // This is a "native" vector type
243 return std::pair(NumElts, EltVT);
244
245 case MVT::v16f16: // <8 x f16x2>
246 case MVT::v16bf16: // <8 x bf16x2>
247 case MVT::v16i16: // <8 x i16x2>
248 case MVT::v32i8: // <8 x i8x4>
249 // This can be upsized into a "native" vector type iff the address space is
250 // global and the target supports 256-bit loads/stores.
251 if (!CanLowerTo256Bit)
252 return std::nullopt;
253 [[fallthrough]];
254 case MVT::v2i16: // <1 x i16x2>
255 case MVT::v2f16: // <1 x f16x2>
256 case MVT::v2bf16: // <1 x bf16x2>
257 case MVT::v4i8: // <1 x i8x4>
258 case MVT::v4i16: // <2 x i16x2>
259 case MVT::v4f16: // <2 x f16x2>
260 case MVT::v4bf16: // <2 x bf16x2>
261 case MVT::v8i8: // <2 x i8x4>
262 case MVT::v8f16: // <4 x f16x2>
263 case MVT::v8bf16: // <4 x bf16x2>
264 case MVT::v8i16: // <4 x i16x2>
265 case MVT::v16i8: // <4 x i8x4>
266 PackRegSize = 32;
267 break;
268
269 case MVT::v8f32: // <4 x f32x2>
270 case MVT::v8i32: // <4 x i32x2>
271 // This is a "native" vector type iff the address space is global and the
272 // target supports 256-bit loads/stores
273 if (!CanLowerTo256Bit)
274 return std::nullopt;
275 [[fallthrough]];
276 case MVT::v2f32: // <1 x f32x2>
277 case MVT::v4f32: // <2 x f32x2>
278 case MVT::v2i32: // <1 x i32x2>
279 case MVT::v4i32: // <2 x i32x2>
280 if (!STI.hasF32x2Instructions())
281 return std::pair(NumElts, EltVT);
282 PackRegSize = 64;
283 break;
284 }
285
286 // If we reach here, then we can pack 2 or more elements into a single 32-bit
287 // or 64-bit PTX register and treat the vector as a new vector containing
288 // packed elements.
289
290 // Number of elements to pack in one word.
291 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
292
293 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
294}
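// Worked example: for a generic-address-space load of MVT::v8f16, EltVT is
// f16 (16 bits) and PackRegSize is 32, so NPerReg == 2 and the function
// returns {4, MVT::v2f16} -- the load is treated as 4 elements, each an
// f16x2 pair held in a single 32-bit PTX register.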
295
296/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
297/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
298/// the types as required by the calling convention (with special handling for
299/// i8s).
300/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
301/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
302/// LowerCall, and LowerReturn.
303static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
304 LLVMContext &Ctx, CallingConv::ID CallConv,
305 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
307 uint64_t StartingOffset = 0) {
308 SmallVector<EVT, 16> TempVTs;
309 SmallVector<uint64_t, 16> TempOffsets;
310 ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,
311 StartingOffset);
312
313 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
314 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
315 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
316
317 // Since we actually can load/store b8, we need to ensure that we'll use
318 // the original sized type for any i8s or i8 vectors.
319 if (VT.getScalarType() == MVT::i8) {
320 if (RegisterVT == MVT::i16)
321 RegisterVT = MVT::i8;
322 else if (RegisterVT == MVT::v2i16)
323 RegisterVT = MVT::v2i8;
324 else
325 assert(RegisterVT == MVT::v4i8 &&
326 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
327 }
328
329 // TODO: This is horribly incorrect for cases where the vector elements are
330 // not a multiple of bytes (e.g. i1) and legal, or i8. However, this problem
331 // has existed for as long as NVPTX has and no one has complained, so we'll
332 // leave it for now.
333 for (unsigned I : seq(NumRegs)) {
334 ValueVTs.push_back(RegisterVT);
335 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
336 }
337 }
338}
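// Worked example: for Ty == { i32, i8 }, ComputeValueVTs yields {i32, i8} at
// offsets {0, 4}; the i8 would normally be reported as an i16 register, but
// the fix-up above keeps it at its original width, so ValueVTs = {MVT::i32,
// MVT::i8} and Offsets = {0, 4}.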
339
340// We return an EVT that can hold N VTs
341// If the VT is a vector, the resulting EVT is a flat vector with the same
342// element type as VT's element type.
343static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
344 if (N == 1)
345 return VT;
346
347 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
348 VT.getVectorNumElements() * N)
349 : EVT::getVectorVT(C, VT, N);
350}
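// Worked examples: getVectorizedVT(MVT::f32, 4, Ctx) == MVT::v4f32, and
// getVectorizedVT(MVT::v2f16, 2, Ctx) == MVT::v4f16 (the result is always a
// flat vector, never a vector of vectors).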
351
352static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
353 const SDLoc &dl, SelectionDAG &DAG) {
354 if (V.getValueType() == VT) {
355 assert(I == 0 && "Index must be 0 for scalar value");
356 return V;
357 }
358
359 if (!VT.isVector())
360 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
361 DAG.getVectorIdxConstant(I, dl));
362
363 return DAG.getNode(
364 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
366}
367
368template <typename T>
369static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
370 SelectionDAG &DAG, T GetElement) {
371 if (N == 1)
372 return GetElement(0);
373
375 for (const unsigned I : llvm::seq(N)) {
376 SDValue Val = GetElement(I);
377 if (Val.getValueType().isVector())
378 DAG.ExtractVectorElements(Val, Values);
379 else
380 Values.push_back(Val);
381 }
382
383 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
384 Values.size());
385 return DAG.getBuildVector(VT, dl, Values);
386}
387
388/// PromoteScalarIntegerPTX
389/// Used to make sure the arguments/returns are suitable for passing
390/// and promote them to a larger size if they're not.
391///
392/// Returns the promoted type, or \p VT unchanged if no promotion is needed.
393static EVT promoteScalarIntegerPTX(EVT VT) {
394 if (VT.isScalarInteger()) {
395 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
396 default:
397 llvm_unreachable(
398 "Promotion is not suitable for scalars of size larger than 64-bits");
399 case 1:
400 return MVT::i1;
401 case 2:
402 case 4:
403 case 8:
404 return MVT::i8;
405 case 16:
406 return MVT::i16;
407 case 32:
408 return MVT::i32;
409 case 64:
410 return MVT::i64;
411 }
412 }
413 return VT;
414}
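// Worked example: an i5 value is promoted to i8 (PowerOf2Ceil(5) == 8) and an
// i33 to i64, while i1 and the power-of-two widths i8/i16/i32/i64 are
// returned unchanged.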
415
416// Check whether we can merge loads/stores of some of the pieces of a
417// flattened function parameter or return value into a single vector
418// load/store.
419//
420// The flattened parameter is represented as a list of EVTs and
421// offsets, and the whole structure is aligned to ParamAlignment. This
422// function determines whether we can load/store pieces of the
423// parameter starting at index Idx using a single vectorized op of
424// size AccessSize. If so, it returns the number of param pieces
425// covered by the vector op. Otherwise, it returns 1.
426template <typename T>
427static unsigned canMergeParamLoadStoresStartingAt(
428 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
429 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
430
431 // Can't vectorize if param alignment is not sufficient.
432 if (ParamAlignment < AccessSize)
433 return 1;
434 // Can't vectorize if offset is not aligned.
435 if (Offsets[Idx] & (AccessSize - 1))
436 return 1;
437
438 EVT EltVT = ValueVTs[Idx];
439 unsigned EltSize = EltVT.getStoreSize();
440
441 // Element is too large to vectorize.
442 if (EltSize >= AccessSize)
443 return 1;
444
445 unsigned NumElts = AccessSize / EltSize;
446 // Can't vectorize if AccessSize is not a multiple of EltSize.
447 if (AccessSize != EltSize * NumElts)
448 return 1;
449
450 // We don't have enough elements to vectorize.
451 if (Idx + NumElts > ValueVTs.size())
452 return 1;
453
454 // PTX ISA can only deal with 2- and 4-element vector ops.
455 if (NumElts != 4 && NumElts != 2)
456 return 1;
457
458 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
459 // Types do not match.
460 if (ValueVTs[j] != EltVT)
461 return 1;
462
463 // Elements are not contiguous.
464 if (Offsets[j] - Offsets[j - 1] != EltSize)
465 return 1;
466 }
467 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
468 return NumElts;
469}
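// Worked example: with ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8,
// 12} and ParamAlignment = 16, a query at Idx = 0 with AccessSize = 16
// returns 4, i.e. all four pieces can be covered by one 128-bit access.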
470
471// Computes whether and how we can vectorize the loads/stores of a
472// flattened function parameter or return value.
473//
474 // The flattened parameter is represented as the list of ValueVTs and
475 // Offsets, and is aligned to ParamAlignment bytes. We return a vector with
476 // one entry per vectorized access: each entry is the number of consecutive
477 // pieces covered by that access (1 for a scalar access), and the entries
478 // sum to ValueVTs.size().
479template <typename T>
480static SmallVector<unsigned, 16>
481VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
482 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
483 bool IsVAArg = false) {
484 // Set vector size to match ValueVTs and mark all elements as
485 // scalars by default.
486
487 if (IsVAArg)
488 return SmallVector<unsigned>(ValueVTs.size(), 1);
489
490 SmallVector<unsigned, 16> VectorInfo;
491
492 const auto GetNumElts = [&](unsigned I) -> unsigned {
493 for (const unsigned AccessSize : {16, 8, 4, 2}) {
494 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
495 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
496 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
497 "Unexpected vectorization size");
498 if (NumElts != 1)
499 return NumElts;
500 }
501 return 1;
502 };
503
504 // Check what we can vectorize using 128/64/32-bit accesses.
505 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
506 const unsigned NumElts = GetNumElts(I);
507 VectorInfo.push_back(NumElts);
508 I += NumElts;
509 }
510 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
511 ValueVTs.size());
512 return VectorInfo;
513}
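// Worked example: for four f32 pieces at offsets {0, 4, 8, 12} with 16-byte
// alignment this returns {4} (one 128-bit access covering all pieces); with
// only 8-byte alignment it returns {2, 2} (two 64-bit accesses).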
514
515// NVPTXTargetLowering Constructor.
516NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
517 const NVPTXSubtarget &STI)
518 : TargetLowering(TM, STI), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
519 // Always lower memset, memcpy, and memmove intrinsics to load/store
520 // instructions, rather than generating calls to memset, memcpy, or
521 // memmove.
525
528
529 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
530 // condition branches.
531 setJumpIsExpensive(true);
532
533 // Wide divides are _very_ slow. Try to reduce the width of the divide if
534 // possible.
535 addBypassSlowDiv(64, 32);
536
537 // By default, use the Source scheduling
538 if (sched4reg)
540 else
542
543 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
544 LegalizeAction NoF16Action) {
545 bool IsOpSupported = STI.allowFP16Math();
546 switch (Op) {
547 // Several FP16 instructions are available on sm_80 only.
548 case ISD::FMINNUM:
549 case ISD::FMAXNUM:
552 case ISD::FMAXIMUM:
553 case ISD::FMINIMUM:
554 case ISD::FMAXIMUMNUM:
555 case ISD::FMINIMUMNUM:
556 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
557 break;
558 case ISD::FEXP2:
559 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
560 break;
561 }
562 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
563 };
564
565 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
566 LegalizeAction NoBF16Action) {
567 bool IsOpSupported = STI.hasNativeBF16Support(Op);
569 Op, VT, IsOpSupported ? Action : NoBF16Action);
570 };
571
572 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
573 LegalizeAction NoI16x2Action) {
574 bool IsOpSupported = false;
575 // These i16x2 instructions are available on sm_90 only.
576 switch (Op) {
577 case ISD::ADD:
578 case ISD::SMAX:
579 case ISD::SMIN:
580 case ISD::UMIN:
581 case ISD::UMAX:
582 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
583 break;
584 }
585 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
586 };
587
588 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
589 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
590 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
591 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
592 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
593 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
594 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
595 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
596 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
597 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
598 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
599 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
600
601 if (STI.hasF32x2Instructions()) {
602 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
603 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
604 }
605
606 // Conversion to/from FP16/FP16x2 is always legal.
611
613 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
615
616 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
617 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
618
619 // Conversion to/from BF16/BF16x2 is always legal.
624
625 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
626 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
627 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
628 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
629
630 // Conversion to/from i16/i16x2 is always legal.
635
640
641 // No support for these operations with v2f32/v2i32
642 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
643 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
644
647 MVT::v2i32, Expand);
648
649 // Need custom lowering in case the index is dynamic.
650 if (STI.hasF32x2Instructions())
651 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
652 Custom);
653
654 // Custom conversions to/from v2i8.
656
657 // Only logical ops can be done on v4i8/v2i32 directly; others must be done
658 // elementwise.
675 {MVT::v4i8, MVT::v2i32}, Expand);
676
677 // Operations not directly supported by NVPTX.
678 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
679 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
680 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
683 }
684
685 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
686 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
687
688 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
689 // For others we will expand to a SHL/SRA pair.
695 setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
696
703
706
708 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
709 Expand);
710
711 if (STI.hasHWROT32()) {
714 Custom);
715 }
716
717 setOperationAction(ISD::BR_JT, MVT::Other, STI.hasBrx() ? Legal : Expand);
719
720 // We want to legalize constant related memmove and memcopy
721 // intrinsics.
723
724 // FP extload/truncstore is not legal in PTX. We need to expand all these.
725 for (auto FloatVTs :
727 for (MVT ValVT : FloatVTs) {
728 for (MVT MemVT : FloatVTs) {
729 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
730 setTruncStoreAction(ValVT, MemVT, Expand);
731 }
732 }
733 }
734
735 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
736 // how they'll be lowered in ISel anyway, and by doing this a little earlier
737 // we allow for more DAG combine opportunities.
738 for (auto IntVTs :
740 for (MVT ValVT : IntVTs)
741 for (MVT MemVT : IntVTs)
742 if (isTypeLegal(ValVT))
743 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
744
745 // PTX does not support load / store predicate registers
747 for (MVT VT : MVT::integer_valuetypes()) {
749 Promote);
750 setTruncStoreAction(VT, MVT::i1, Expand);
751 }
752
753 // Disable generation of extload/truncstore for v2i32/v2i16/v2i8. The generic
754 // expansion for these nodes when they are unaligned is incorrect if the
755 // type is a vector.
756 //
757 // TODO: Fix the generic expansion for these nodes found in
758 // TargetLowering::expandUnalignedLoad/Store.
760 MVT::v2i8, Expand);
762 {MVT::v2i8, MVT::v2i16}, Expand);
763 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
764 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
765 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
766
767 // Register custom handling for illegal type loads/stores. We'll try to custom
768 // lower almost all illegal types and logic in the lowering will discard cases
769 // we can't handle.
770 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::i256, MVT::f128},
771 Custom);
773 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
775 Custom);
776
777 // Custom legalization for LDU intrinsics.
778 // TODO: The logic to lower these is not very robust and we should rewrite it.
779 // Perhaps LDU should not be represented as an intrinsic at all.
782 if (IsPTXVectorType(VT))
784
788 MVT::i1, Expand);
789
790 // This is legal in NVPTX
795
796 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
798
799 // TRAP can be lowered to PTX trap
800 setOperationAction(ISD::TRAP, MVT::Other, Legal);
801 // DEBUGTRAP can be lowered to PTX brkpt
803
804 // Support varargs.
809
811 {MVT::i16, MVT::i32, MVT::i64}, Legal);
812
814 Promote);
817
818 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
819 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
820 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
821 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
822 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
823 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
824 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
825
826 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
827 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
828 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
829 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
830 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
831 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
832
833 // Other arithmetic and logic ops are unsupported.
837 {MVT::v2i16, MVT::v2i32}, Expand);
838
839 // v2i32 is not supported for any arithmetic operations
844 MVT::v2i32, Expand);
845
850 if (STI.getPTXVersion() >= 43) {
855 }
856
858 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
861
862 // PTX does not directly support SELP of i1, so promote to i32 first
864
865 // PTX cannot multiply two i64s in a single instruction.
868
869 // We have some custom DAG combine patterns for these nodes
871 ISD::AND,
873 ISD::FADD,
880 ISD::MUL,
882 ISD::SHL,
883 ISD::SREM,
884 ISD::UREM,
888 ISD::LOAD,
893
894 // setcc for f16x2 and bf16x2 needs special handling to prevent the
895 // legalizer's attempt to scalarize it due to v2i1 not being legal.
896 if (STI.allowFP16Math() || STI.hasBF16Math())
898
899 // Vector reduction operations. These may be turned into shuffle or tree
900 // reductions depending on what instructions are available for each type.
902 MVT EltVT = VT.getVectorElementType();
903 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
906 VT, Custom);
907 }
908 }
909
910 // Promote fp16 arithmetic if fp16 hardware isn't available or the
911 // user passed --nvptx-no-fp16-math. The flag is useful because,
912 // although sm_53+ GPUs have some sort of FP16 support in
913 // hardware, only sm_53 and sm_60 have a full implementation. Others
914 // have only a token amount of hardware and are likely to run faster
915 // by using fp32 units instead.
916 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
917 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
918 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
919 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
920 // bf16 must be promoted to f32.
921 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
922 if (getOperationAction(Op, MVT::bf16) == Promote)
923 AddPromotedToType(Op, MVT::bf16, MVT::f32);
924 setOperationAction(Op, MVT::v2f32,
925 STI.hasF32x2Instructions() ? Legal : Expand);
926 }
927
928 // On SM80, we select add/mul/sub as fma to avoid promotion to float
929 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
930 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
931 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
933 }
934 }
935 }
936
937 // f16/f16x2 neg was introduced in PTX 60, SM_53.
938 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
939 STI.getPTXVersion() >= 60 &&
940 STI.allowFP16Math();
941 for (const auto &VT : {MVT::f16, MVT::v2f16})
943 IsFP16FP16x2NegAvailable ? Legal : Expand);
944
945 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
946 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
947 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
948 // (would be) Library functions.
949
950 // These map to conversion instructions for scalar FP types.
951 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
953 setOperationAction(Op, MVT::f16, Legal);
954 setOperationAction(Op, MVT::f32, Legal);
955 setOperationAction(Op, MVT::f64, Legal);
956 setOperationAction(Op, MVT::v2f16, Expand);
957 setOperationAction(Op, MVT::v2bf16, Expand);
958 setOperationAction(Op, MVT::v2f32, Expand);
959 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
960 if (getOperationAction(Op, MVT::bf16) == Promote)
961 AddPromotedToType(Op, MVT::bf16, MVT::f32);
962 }
963
964 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
966 }
967 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
968 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
971 }
972 }
973
974 // Expand v2f32 = fp_extend
976 // Expand v2[b]f16 = fp_round v2f32
977 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
978
979 // sm_80 only has conversions between f32 and bf16. Custom lower all other
980 // bf16 conversions.
981 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
982 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
985 VT, Custom);
986 }
989 MVT::bf16, Custom);
990 }
991
998 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
999
1000 // 'Expand' implements FCOPYSIGN without calling an external library.
1007
1008 // These map to corresponding instructions for f32/f64. f16 must be
1009 // promoted to f32. v2f16 is expanded to f16, which is then promoted
1010 // to f32.
1011 for (const auto &Op :
1013 setOperationAction(Op, MVT::f16, Promote);
1014 setOperationAction(Op, MVT::f32, Legal);
1015 // only div/rem/sqrt are legal for f64
1016 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
1017 setOperationAction(Op, MVT::f64, Legal);
1018 }
1019 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
1020 setOperationAction(Op, MVT::bf16, Promote);
1021 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1022 }
1023 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1024
1025 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1026 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1027 if (STI.getPTXVersion() >= 65) {
1028 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1029 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1030 } else {
1032 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1033 }
1034 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1035 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1036 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1037 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1038
1039 for (const auto &Op :
1041 setOperationAction(Op, MVT::f32, Legal);
1042 setOperationAction(Op, MVT::f64, Legal);
1043 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1044 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1045 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1046 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1047 if (getOperationAction(Op, MVT::bf16) == Promote)
1048 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1049 setOperationAction(Op, MVT::v2f32, Expand);
1050 }
1051 bool SupportsF32MinMaxNaN =
1052 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1053 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1054 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1055 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1056 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1057 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1058 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1059 setOperationAction(Op, MVT::v2f32, Expand);
1060 }
1061
1062 // Custom lowering for inline asm with 128-bit operands
1065
1066 // FEXP2 support:
1067 // - f32
1068 // - f16/f16x2 (sm_70+, PTX 7.0+)
1069 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1070 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1072 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1073 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1074 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1075 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1076 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1077
1078 // FLOG2 supports f32 only
1079 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1080 if (UseApproxLog2F32) {
1082 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1083 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1084 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1085 Expand);
1086 }
1087
1088 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1089
1090 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1091
1092 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1093 // type, we need to custom lower it.
1095 Custom);
1096
1097 // Now deduce the information based on the above mentioned
1098 // actions
1099 computeRegisterProperties(STI.getRegisterInfo());
1100
1101 // PTX support for 16-bit CAS is emulated. Only use 32+
1102 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1103 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1105
1106 // Custom lowering for tcgen05.ld vector operands
1108 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1109 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::v2f32,
1110 MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32,
1111 MVT::v64f32, MVT::v128f32},
1112 Custom);
1113
1114 // Custom lowering for tcgen05.st vector operands
1116 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1117 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1118 Custom);
1119
1120 // Enable custom lowering for the following:
1121 // * MVT::i128 - clusterlaunchcontrol
1122 // * MVT::i32 - prmt
1123 // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
1124 // * MVT::Other - internal.addrspace.wrap
1126 {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
1127
1128 // Custom lowering for bswap
1129 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
1130 Custom);
1131}
1132
1135 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1136 VT.getScalarType() == MVT::i1)
1137 return TypeSplitVector;
1139}
1140
1142 int Enabled, int &ExtraSteps,
1143 bool &UseOneConst,
1144 bool Reciprocal) const {
1147 return SDValue();
1148
1149 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1150 ExtraSteps = 0;
1151
1152 SDLoc DL(Operand);
1153 EVT VT = Operand.getValueType();
1154 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1155
1156 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1157 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1158 DAG.getConstant(IID, DL, MVT::i32), Operand);
1159 };
1160
1161 // The sqrt and rsqrt refinement processes assume we always start out with an
1162 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1163 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1164 // any refinement, we must return a regular sqrt.
1165 if (Reciprocal || ExtraSteps > 0) {
1166 if (VT == MVT::f32)
1167 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1168 : Intrinsic::nvvm_rsqrt_approx_f);
1169 else if (VT == MVT::f64)
1170 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1171 else
1172 return SDValue();
1173 } else {
1174 if (VT == MVT::f32)
1175 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1176 : Intrinsic::nvvm_sqrt_approx_f);
1177 else {
1178 // There's no sqrt.approx.f64 instruction, so we emit
1179 // reciprocal(rsqrt(x)). This is faster than
1180 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1181 // x * rsqrt(x).)
1182 return DAG.getNode(
1184 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1185 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1186 }
1187 }
1188}
1189
1190std::string NVPTXTargetLowering::getPrototype(
1191 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1192 const SmallVectorImpl<ISD::OutputArg> &Outs,
1193 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1194 unsigned UniqueCallSite) const {
1195 auto PtrVT = getPointerTy(DL);
1196
1197 std::string Prototype;
1198 raw_string_ostream O(Prototype);
1199 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1200
1201 if (RetTy->isVoidTy()) {
1202 O << "()";
1203 } else {
1204 O << "(";
1205 if (shouldPassAsArray(RetTy)) {
1206 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1207 O << ".param .align " << RetAlign.value() << " .b8 _["
1208 << DL.getTypeAllocSize(RetTy) << "]";
1209 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1210 unsigned size = 0;
1211 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1212 size = ITy->getBitWidth();
1213 } else {
1214 assert(RetTy->isFloatingPointTy() &&
1215 "Floating point type expected here");
1216 size = RetTy->getPrimitiveSizeInBits();
1217 }
1218 // PTX ABI requires all scalar return values to be at least 32
1219 // bits in size. fp16 normally uses .b16 as its storage type in
1220 // PTX, so its size must be adjusted here, too.
1222
1223 O << ".param .b" << size << " _";
1224 } else if (isa<PointerType>(RetTy)) {
1225 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1226 } else {
1227 llvm_unreachable("Unknown return type");
1228 }
1229 O << ") ";
1230 }
1231 O << "_ (";
1232
1233 bool first = true;
1234
1235 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1236 auto AllOuts = ArrayRef(Outs);
1237 for (const unsigned I : llvm::seq(NumArgs)) {
1238 const auto ArgOuts =
1239 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1240 AllOuts = AllOuts.drop_front(ArgOuts.size());
1241
1242 Type *Ty = Args[I].Ty;
1243 if (!first) {
1244 O << ", ";
1245 }
1246 first = false;
1247
1248 if (ArgOuts[0].Flags.isByVal()) {
1249 // Indirect calls need strict ABI alignment so we disable optimizations by
1250 // not providing a function to optimize.
1251 Type *ETy = Args[I].IndirectType;
1252 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1253 Align ParamByValAlign =
1254 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1255
1256 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1257 << ArgOuts[0].Flags.getByValSize() << "]";
1258 } else {
1259 if (shouldPassAsArray(Ty)) {
1260 Align ParamAlign =
1261 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1262 O << ".param .align " << ParamAlign.value() << " .b8 _["
1263 << DL.getTypeAllocSize(Ty) << "]";
1264 continue;
1265 }
1266 // i8 types in IR will be i16 types in SDAG
1267 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1268 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1269 "type mismatch between callee prototype and arguments");
1270 // scalar type
1271 unsigned sz = 0;
1272 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1273 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1274 } else if (isa<PointerType>(Ty)) {
1275 sz = PtrVT.getSizeInBits();
1276 } else {
1277 sz = Ty->getPrimitiveSizeInBits();
1278 }
1279 O << ".param .b" << sz << " _";
1280 }
1281 }
1282
1283 if (FirstVAArg)
1284 O << (first ? "" : ",") << " .param .align "
1285 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1286 O << ")";
1287 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1288 O << " .noreturn";
1289 O << ";";
1290
1291 return Prototype;
1292}
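// Illustrative output (a sketch, 64-bit pointers assumed): for an indirect
// call to a function with C prototype `float f(int, char *)`, getPrototype
// emits something like
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// where the return value is widened to at least 32 bits and the pointer
// argument uses the pointer width of the target.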
1293
1294Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1295 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1296 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1297}
1298
1299Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1300 unsigned Idx,
1301 const DataLayout &DL) const {
1302 if (!CB) {
1303 // CallSite is null; fall back to the ABI type alignment
1304 return DL.getABITypeAlign(Ty);
1305 }
1306
1307 const Function *DirectCallee = CB->getCalledFunction();
1308
1309 if (!DirectCallee) {
1310 // We don't have a direct function symbol, but that may be because of
1311 // constant cast instructions in the call.
1312
1313 // With bitcast'd call targets, the instruction will be the call
1314 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1315 // Check if we have call alignment metadata
1316 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1317 return StackAlign.value();
1318 }
1319 DirectCallee = getMaybeBitcastedCallee(CB);
1320 }
1321
1322 // Check for function alignment information if we found that the
1323 // ultimate target is a Function
1324 if (DirectCallee)
1325 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1326
1327 // Call is indirect, fall back to the ABI type alignment
1328 return DL.getABITypeAlign(Ty);
1329}
1330
1331static bool shouldConvertToIndirectCall(const CallBase *CB,
1332 const GlobalAddressSDNode *Func) {
1333 if (!Func)
1334 return false;
1335 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1336 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1337 return false;
1338}
1339
1340static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
1341 const DataLayout &DL,
1342 const TargetLowering &TL) {
1343 if (Ptr->getOpcode() == ISD::FrameIndex) {
1344 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1345 Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
1347
1349 }
1350
1351 // Peel off an addrspacecast to generic and load directly from the specific
1352 // address space.
1353 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1354 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1355 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1356 Ptr = ASC->getOperand(0);
1357 return MachinePointerInfo(ASC->getSrcAddressSpace());
1358 }
1359 }
1360
1361 return MachinePointerInfo();
1362}
1363
1365 if (Flags.isSExt())
1366 return ISD::SIGN_EXTEND;
1367 if (Flags.isZExt())
1368 return ISD::ZERO_EXTEND;
1369 return ISD::ANY_EXTEND;
1370}
1371
1373 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1374 SDLoc dl) {
1375 const EVT ActualVT = V.getValueType();
1376 assert((ActualVT == ExpectedVT ||
1377 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1378 "Non-integer argument type size mismatch");
1379 if (ExpectedVT.bitsGT(ActualVT))
1380 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1381 if (ExpectedVT.bitsLT(ActualVT))
1382 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1383
1384 return V;
1385}
1386
1387SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1388 SmallVectorImpl<SDValue> &InVals) const {
1389
1390 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1392 "Support for variadic functions (unsized array parameter) introduced "
1393 "in PTX ISA version 6.0 and requires target sm_30.");
1394
1395 SelectionDAG &DAG = CLI.DAG;
1396 SDLoc dl = CLI.DL;
1397 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1398 SDValue Callee = CLI.Callee;
1399 ArgListTy &Args = CLI.getArgs();
1400 Type *RetTy = CLI.RetTy;
1401 const CallBase *CB = CLI.CB;
1402 const DataLayout &DL = DAG.getDataLayout();
1403 LLVMContext &Ctx = *DAG.getContext();
1404
1405 const auto GetI32 = [&](const unsigned I) {
1406 return DAG.getConstant(I, dl, MVT::i32);
1407 };
1408
1409 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1410 const SDValue CallChain = CLI.Chain;
1411 const SDValue StartChain =
1412 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1413 SDValue DeclareGlue = StartChain.getValue(1);
1414
1415 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1416
1417 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1418 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1419 // loaded/stored using i16, so it's handled here as well.
1420 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1421 SDValue Declare =
1422 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1423 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1424 CallPrereqs.push_back(Declare);
1425 DeclareGlue = Declare.getValue(1);
1426 return Declare;
1427 };
1428
1429 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1430 unsigned Size) {
1431 SDValue Declare = DAG.getNode(
1432 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1433 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1434 CallPrereqs.push_back(Declare);
1435 DeclareGlue = Declare.getValue(1);
1436 return Declare;
1437 };
1438
1439 // Variadic arguments.
1440 //
1441 // Normally, for each argument, we declare a param scalar or a param
1442 // byte array in the .param space, and store the argument value to that
1443 // param scalar or array starting at offset 0.
1444 //
1445 // In the case of the first variadic argument, we declare a vararg byte array
1446 // with size 0. The exact size of this array isn't known at this point, so
1447 // it'll be patched later. All the variadic arguments will be stored to this
1448 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1449 // initially set to 0, so it can be used for non-variadic arguments (which use
1450 // 0 offset) to simplify the code.
1451 //
1452 // After all vararg is processed, 'VAOffset' holds the size of the
1453 // vararg byte array.
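// Illustrative example: for a call like printf(fmt, (int)x, (double)y), the
// fixed `fmt` argument gets its own param declaration, while x and y are
// stored into the single vararg byte array at VAOffset 0 and 8 respectively
// (the double is aligned to 8 bytes), and the array's final size of 16 is
// patched into its declaration after all arguments have been processed.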
1454 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1455 "Non-VarArg function with extra arguments");
1456
1457 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1458 unsigned VAOffset = 0; // current offset in the param array
1459
1460 const SDValue VADeclareParam =
1461 CLI.Args.size() > FirstVAArg
1462 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1463 Align(STI.getMaxRequiredAlignment()), 0)
1464 : SDValue();
1465
1466 // Args.size() and Outs.size() need not match.
1467 // Outs.size() will be larger
1468 // * if there is an aggregate argument with multiple fields (each field
1469 // showing up separately in Outs)
1470 // * if there is a vector argument with more than typical vector-length
1471 // elements (generally if more than 4) where each vector element is
1472 // individually present in Outs.
1473 // So a different index should be used for indexing into Outs/OutVals.
1474 // See similar issue in LowerFormalArguments.
1475 auto AllOuts = ArrayRef(CLI.Outs);
1476 auto AllOutVals = ArrayRef(CLI.OutVals);
1477 assert(AllOuts.size() == AllOutVals.size() &&
1478 "Outs and OutVals must be the same size");
1479 // Declare the .params or .reg needed to pass values
1480 // to the function
1481 for (const auto E : llvm::enumerate(Args)) {
1482 const auto ArgI = E.index();
1483 const auto Arg = E.value();
1484 const auto ArgOuts =
1485 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1486 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1487 AllOuts = AllOuts.drop_front(ArgOuts.size());
1488 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1489
1490 const bool IsVAArg = (ArgI >= FirstVAArg);
1491 const bool IsByVal = Arg.IsByVal;
1492
1493 const SDValue ParamSymbol =
1494 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1495
1496 assert((!IsByVal || Arg.IndirectType) &&
1497 "byval arg must have indirect type");
1498 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1499
1500 const Align ArgAlign = [&]() {
1501 if (IsByVal) {
1502 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1503 // so we don't need to worry whether it's naturally aligned or not.
1504 // See TargetLowering::LowerCallTo().
1505 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1507 InitialAlign, DL);
1508 }
1509 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1510 }();
1511
1512 const unsigned TySize = DL.getTypeAllocSize(ETy);
1513 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1514 "type size mismatch");
1515
1516 const SDValue ArgDeclare = [&]() {
1517 if (IsVAArg)
1518 return VADeclareParam;
1519
1520 if (IsByVal || shouldPassAsArray(Arg.Ty))
1521 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1522
1523 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1524 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1525 "Only int and float types are supported as non-array arguments");
1526
1527 return MakeDeclareScalarParam(ParamSymbol, TySize);
1528 }();
1529
1530 if (IsByVal) {
1531 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1532 SDValue SrcPtr = ArgOutVals[0];
1533 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1534 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1535
1536 if (IsVAArg)
1537 VAOffset = alignTo(VAOffset, ArgAlign);
1538
1539 SmallVector<EVT, 4> ValueVTs, MemVTs;
1541 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1542
1543 unsigned J = 0;
1544 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1545 for (const unsigned NumElts : VI) {
1546 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1547 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1548 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1549 SDValue SrcLoad =
1550 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1551
1552 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1553 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1554 SDValue ParamAddr =
1555 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1556 SDValue StoreParam =
1557 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1559 CallPrereqs.push_back(StoreParam);
1560
1561 J += NumElts;
1562 }
1563 if (IsVAArg)
1564 VAOffset += TySize;
1565 } else {
1568 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1569 VAOffset);
1570 assert(VTs.size() == Offsets.size() && "Size mismatch");
1571 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1572
1573 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1574 // than 32-bits are sign extended or zero extended, depending on
1575 // whether they are signed or unsigned types. This case applies
1576 // only to scalar parameters and not to aggregate values.
1577 const bool ExtendIntegerParam =
1578 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1579
1580 const auto GetStoredValue = [&](const unsigned I) {
1581 SDValue StVal = ArgOutVals[I];
1583 StVal.getValueType() &&
1584 "OutVal type should always be legal");
1585
1586 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1587 const EVT StoreVT =
1588 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1589
1590 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1591 };
1592
1593 unsigned J = 0;
1594 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1595 for (const unsigned NumElts : VI) {
1596 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1597
1598 unsigned Offset;
1599 if (IsVAArg) {
1600 // TODO: We may need to support vector types that can be passed
1601 // as scalars in variadic arguments.
1602 assert(NumElts == 1 &&
1603 "Vectorization should be disabled for vaargs.");
1604
1605 // Align each part of the variadic argument to their type.
1606 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1607 Offset = VAOffset;
1608
1609 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1610 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1611 } else {
1612 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1613 Offset = Offsets[J];
1614 }
1615
1616 SDValue Ptr =
1617 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1618
1619 const MaybeAlign CurrentAlign = ExtendIntegerParam
1620 ? MaybeAlign(std::nullopt)
1621 : commonAlignment(ArgAlign, Offset);
1622
1623 SDValue Val =
1624 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1625 return GetStoredValue(J + K);
1626 });
1627
1628 SDValue StoreParam =
1629 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1631 CallPrereqs.push_back(StoreParam);
1632
1633 J += NumElts;
1634 }
1635 }
1636 }
1637
1638 // Handle Result
1639 if (!Ins.empty()) {
1640 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1641 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1642 if (shouldPassAsArray(RetTy)) {
1643 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1644 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1645 } else {
1646 MakeDeclareScalarParam(RetSymbol, ResultSize);
1647 }
1648 }
1649
1650 // Set the size of the vararg param byte array if the callee is a variadic
1651 // function and the variadic part is not empty.
1652 if (VADeclareParam) {
1653 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1654 VADeclareParam.getOperand(1),
1655 VADeclareParam.getOperand(2), GetI32(VAOffset),
1656 VADeclareParam.getOperand(4)};
1657 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1658 VADeclareParam->getVTList(), DeclareParamOps);
1659 }
1660
1661 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1662 // If the type of the callsite does not match that of the function, convert
1663 // the callsite to an indirect call.
1664 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1665
1666 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1667 // between them we must rely on the call site value which is valid for
1668 // indirect calls but is always null for libcalls.
1669 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1670
1671 if (isa<ExternalSymbolSDNode>(Callee)) {
1672 Function* CalleeFunc = nullptr;
1673
1674 // Try to find the callee in the current module.
1675 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1676 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1677
1678 // Set the "libcall callee" attribute to indicate that the function
1679 // must always have a declaration.
1680 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1681 }
1682
1683 if (IsIndirectCall) {
1684 // This is the indirect function call case: PTX requires a prototype of the
1685 // form
1686 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1687 // to be emitted, and the label has to be used as the last arg of the call
1688 // instruction.
1689 // The prototype is embedded in a string and put as the operand for a
1690 // CallPrototype SDNode which will print out to the value of the string.
1691 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1692 std::string Proto =
1693 getPrototype(DL, RetTy, Args, CLI.Outs,
1694 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1695 UniqueCallSite);
1696 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1697 const SDValue PrototypeDeclare = DAG.getNode(
1698 NVPTXISD::CallPrototype, dl, MVT::Other,
1699 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1700 CallPrereqs.push_back(PrototypeDeclare);
1701 }
1702
1703 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1704 const unsigned NumArgs =
1705 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1706 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1707 /// NumParams, Callee, Proto)
1708 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1709 const SDValue Call = DAG.getNode(
1710 NVPTXISD::CALL, dl, MVT::Other,
1711 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1712 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1713
1714 SmallVector<SDValue, 16> LoadChains{Call};
1715 SmallVector<SDValue, 16> ProxyRegOps;
1716 if (!Ins.empty()) {
1719 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1720 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1721
1722 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1723 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1724
1725 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1726 // 32-bits are sign extended or zero extended, depending on whether
1727 // they are signed or unsigned types.
1728 const bool ExtendIntegerRetVal =
1729 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1730
1731 unsigned I = 0;
1732 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1733 for (const unsigned NumElts : VI) {
1734 const MaybeAlign CurrentAlign =
1735 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1736 : commonAlignment(RetAlign, Offsets[I]);
1737
1738 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1739 const EVT LoadVT =
1740 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1741 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1742 SDValue Ptr =
1743 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1744
1745 SDValue R =
1746 DAG.getLoad(VecVT, dl, Call, Ptr,
1748
1749 LoadChains.push_back(R.getValue(1));
1750 for (const unsigned J : llvm::seq(NumElts))
1751 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1752 I += NumElts;
1753 }
1754 }
1755
1756 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1757 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1758 UniqueCallSite + 1, SDValue(), dl);
1759
1760 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1761 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1762 // dangling.
1763 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1764 SDValue Proxy =
1765 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1766 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1767 InVals.push_back(Ret);
1768 }
1769
1770 // set IsTailCall to false for now, until we figure out how to express
1771 // tail call optimization in PTX
1772 CLI.IsTailCall = false;
1773 return CallEnd;
1774}
1775
1777 SelectionDAG &DAG) const {
1778
1779 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1780 const Function &Fn = DAG.getMachineFunction().getFunction();
1781
1783 Fn,
1784 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1785 "requires target sm_52.",
1786 SDLoc(Op).getDebugLoc()));
1787 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1788 Op.getOperand(0)};
1789 return DAG.getMergeValues(Ops, SDLoc());
1790 }
1791
1792 SDLoc DL(Op.getNode());
1793 SDValue Chain = Op.getOperand(0);
1794 SDValue Size = Op.getOperand(1);
1795 uint64_t Align = Op.getConstantOperandVal(2);
1796
1797 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1798 // the default stack alignment should be used.
1799 if (Align == 0)
1801
1802 // The size for the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
1803 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1804
1805 SDValue Alloc =
1806 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1807 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1808 DAG.getTargetConstant(Align, DL, MVT::i32)});
1809
1810 SDValue ASC = DAG.getAddrSpaceCast(
1812
1813 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1814}
1815
1817 SelectionDAG &DAG) const {
1818 SDLoc DL(Op.getNode());
1819 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1820 const Function &Fn = DAG.getMachineFunction().getFunction();
1821
1823 Fn,
1824 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1825 ">= sm_52.",
1826 DL.getDebugLoc()));
1827 return Op.getOperand(0);
1828 }
1829
1830 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1831 SDValue Chain = Op.getOperand(0);
1832 SDValue Ptr = Op.getOperand(1);
1833 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1835 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1836}
1837
1839 SelectionDAG &DAG) const {
1840 SDLoc DL(Op.getNode());
1841 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1842 const Function &Fn = DAG.getMachineFunction().getFunction();
1843
1845 Fn,
1846 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1847 "sm_52.",
1848 DL.getDebugLoc()));
1849 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1850 return DAG.getMergeValues(Ops, DL);
1851 }
1852
1853 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1854 SDValue Chain = Op.getOperand(0);
1855 SDValue SS =
1856 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1857 SDValue ASC = DAG.getAddrSpaceCast(
1858 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1859 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1860}
1861
1862// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1863// (see LegalizeDAG.cpp). This is slow and uses local memory.
1864 // We instead use extract/insert/build vector, just as LegalizeOp() did in LLVM 2.5.
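// For example, (v4i16 concat_vectors (v2i16 A), (v2i16 B)) becomes
// (v4i16 build_vector A0, A1, B0, B1).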
1865SDValue
1866NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1867 SDNode *Node = Op.getNode();
1868 SDLoc dl(Node);
1870 unsigned NumOperands = Node->getNumOperands();
1871 for (unsigned i = 0; i < NumOperands; ++i) {
1872 SDValue SubOp = Node->getOperand(i);
1873 EVT VVT = SubOp.getNode()->getValueType(0);
1874 EVT EltVT = VVT.getVectorElementType();
1875 unsigned NumSubElem = VVT.getVectorNumElements();
1876 for (unsigned j = 0; j < NumSubElem; ++j) {
1877 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1878 DAG.getIntPtrConstant(j, dl)));
1879 }
1880 }
1881 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1882}
1883
1885 SelectionDAG &DAG,
1886 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1887 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1888 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1889 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1890 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1891}
1892
1894 SelectionDAG &DAG,
1895 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1896 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1897}
1898
1899/// Reduces the elements using the scalar operations provided. The operations
1900/// are sorted descending in number of inputs they take. The flags on the
1901/// original reduction operation will be propagated to each scalar operation.
1902/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1903/// used in ExpandReductions and SelectionDAG.
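/// For example, reducing eight elements with the operator list
/// {(fmax3, 3), (fmax, 2)} builds the levels
/// {fmax3(e0,e1,e2), fmax3(e3,e4,e5), e6, e7} -> {fmax3(a,b,e6), e7} ->
/// {fmax(c,e7)}.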
1905 const SmallVector<SDValue> &Elements, EVT EltTy,
1906 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1907 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1908 // Build the reduction tree at each level, starting with all the elements.
1909 SmallVector<SDValue> Level = Elements;
1910
1911 unsigned OpIdx = 0;
1912 while (Level.size() > 1) {
1913 // Try to reduce this level using the current operator.
1914 const auto [Op, NumInputs] = Ops[OpIdx];
1915
1916 // Build the next level by partially reducing all elements.
1917 SmallVector<SDValue> ReducedLevel;
1918 unsigned I = 0, E = Level.size();
1919 for (; I + NumInputs <= E; I += NumInputs) {
1920 // Reduce elements in groups of [NumInputs], as much as possible.
1921 ReducedLevel.push_back(DAG.getNode(
1922 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1923 }
1924
1925 if (I < E) {
1926 // Handle leftover elements.
1927
1928 if (ReducedLevel.empty()) {
1929 // We didn't reduce anything at this level. We need to pick a smaller
1930 // operator.
1931 ++OpIdx;
1932 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1933 continue;
1934 }
1935
1936 // We reduced some things but there's still more left, meaning the
1937 // operator's number of inputs doesn't evenly divide this level size. Move
1938 // these elements to the next level.
1939 for (; I < E; ++I)
1940 ReducedLevel.push_back(Level[I]);
1941 }
1942
1943 // Process the next level.
1944 Level = ReducedLevel;
1945 }
1946
1947 return *Level.begin();
1948}
1949
1950// Get scalar reduction opcode
1951static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1952 switch (ReductionOpcode) {
1953 case ISD::VECREDUCE_FMAX:
1954 return ISD::FMAXNUM;
1955 case ISD::VECREDUCE_FMIN:
1956 return ISD::FMINNUM;
1957 case ISD::VECREDUCE_FMAXIMUM:
1958 return ISD::FMAXIMUM;
1959 case ISD::VECREDUCE_FMINIMUM:
1960 return ISD::FMINIMUM;
1961 default:
1962 llvm_unreachable("unhandled reduction opcode");
1963 }
1964}
1965
1966/// Get 3-input scalar reduction opcode
1967static std::optional<unsigned>
1968getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
1969 switch (ReductionOpcode) {
1971 return NVPTXISD::FMAXNUM3;
1973 return NVPTXISD::FMINNUM3;
1975 return NVPTXISD::FMAXIMUM3;
1977 return NVPTXISD::FMINIMUM3;
1978 default:
1979 return std::nullopt;
1980 }
1981}
1982
1983/// Lower reductions to either a sequence of operations or a tree if
1984/// reassociations are allowed. This method will use larger operations like
1985/// max3/min3 when the target supports them.
1986SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
1987 SelectionDAG &DAG) const {
1988 SDLoc DL(Op);
1989 const SDNodeFlags Flags = Op->getFlags();
1990 SDValue Vector = Op.getOperand(0);
1991
1992 const unsigned Opcode = Op->getOpcode();
1993 const EVT EltTy = Vector.getValueType().getVectorElementType();
1994
1995 // Whether we can use 3-input min/max when expanding the reduction.
1996 const bool CanUseMinMax3 =
1997 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
1998 STI.getPTXVersion() >= 88 &&
1999 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2000 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2001
2002 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2003 // number of inputs they take.
2004 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2005
2006 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2007 CanUseMinMax3 && Opcode3Elem)
2008 ScalarOps.push_back({*Opcode3Elem, 3});
2009 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2010
2012 DAG.ExtractVectorElements(Vector, Elements);
2013
2014 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2015}
2016
2017SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2018 // Handle bitcasting from v2i8 without hitting the default promotion
2019 // strategy which goes through stack memory.
2020 EVT FromVT = Op->getOperand(0)->getValueType(0);
2021 if (FromVT != MVT::v2i8) {
2022 return Op;
2023 }
2024
2025 // Pack vector elements into i16 and bitcast to final type
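// For example, the v2i8 value <0x12, 0x34> is packed into the i16 0x3412
// (element 0 in the low byte) before the final bitcast.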
2026 SDLoc DL(Op);
2027 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2028 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2029 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2030 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2031 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2032 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2033 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2034 SDValue AsInt = DAG.getNode(
2035 ISD::OR, DL, MVT::i16,
2036 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2037 EVT ToVT = Op->getValueType(0);
2038 return DAG.getBitcast(ToVT, AsInt);
2039}
2040
2041// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2042// would get lowered as two constant loads and vector-packing move.
2043// Instead we want just a constant move:
2044// mov.b32 %r2, 0x40003C00
2045SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2046 SelectionDAG &DAG) const {
2047 EVT VT = Op->getValueType(0);
2048 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2049 return Op;
2050 SDLoc DL(Op);
2051
2052 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2053 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2054 isa<ConstantFPSDNode>(Operand);
2055 })) {
2056 if (VT != MVT::v4i8)
2057 return Op;
2058 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2059 // to optimize calculation of constant parts.
2060 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2061 uint64_t SelectionValue) -> SDValue {
2062 SDValue L = Left;
2063 SDValue R = Right;
2064 if (Cast) {
2065 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2066 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2067 }
2068 return getPRMT(L, R, SelectionValue, DL, DAG);
2069 };
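// Selector 0x3340 puts the first operand in byte 0 and the second in byte 1
// (the upper two selector nibbles are effectively don't-cares here), and
// selector 0x5410 then combines the low halves of the two partial results
// into the final i32 with operand I in byte I.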
2070 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2071 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2072 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2073 return DAG.getBitcast(VT, PRMT3210);
2074 }
2075
2076 // Get the value of the Nth operand as an APInt(32). Undefs are treated as 0.
2077 auto GetOperand = [](SDValue Op, int N) -> APInt {
2078 const SDValue &Operand = Op->getOperand(N);
2079 EVT VT = Op->getValueType(0);
2080 if (Operand->isUndef())
2081 return APInt(32, 0);
2082 APInt Value;
2083 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2084 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2085 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2086 Value = Operand->getAsAPIntVal();
2087 else
2088 llvm_unreachable("Unsupported type");
2089 // i8 values are carried around as i16, so we need to zero out the upper bits
2090 // so they do not get in the way of combining individual byte values.
2091 if (VT == MVT::v4i8)
2092 Value = Value.trunc(8);
2093 return Value.zext(32);
2094 };
2095
2096 // Construct a 32-bit constant by shifting into place smaller values
2097 // (elements of the vector type VT).
2098 // For example, if VT has 2 elements, then N == 2:
2099 // ShiftAmount = 32 / N = 16
2100 // Value |= Op0 (b16) << 0
2101 // Value |= Op1 (b16) << 16
2102 // If N == 4:
2103 // ShiftAmount = 32 / N = 8
2104 // Value |= Op0 (b8) << 0
2105 // Value |= Op1 (b8) << 8
2106 // Value |= Op2 (b8) << 16
2107 // Value |= Op3 (b8) << 24
2108 // ...etc
2109 APInt Value(32, 0);
2110 const unsigned NumElements = VT.getVectorNumElements();
2111 assert(32 % NumElements == 0 && "must evenly divide bit length");
2112 const unsigned ShiftAmount = 32 / NumElements;
2113 for (unsigned ElementNo : seq(NumElements))
2114 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2115 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2116 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2117}
2118
2119SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2120 SelectionDAG &DAG) const {
2121 SDValue Index = Op->getOperand(1);
2122 SDValue Vector = Op->getOperand(0);
2123 SDLoc DL(Op);
2124 EVT VectorVT = Vector.getValueType();
2125
2126 if (VectorVT == MVT::v4i8) {
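// Use PRMT to move the requested byte into lane 0 and zero the rest: for
// Index == 2 the selector is 0x7772, so result byte 0 comes from source
// byte 2 and bytes 1..3 come from the zero second operand.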
2127 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2128 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2129 DAG.getConstant(0x7770, DL, MVT::i32));
2130 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2131 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2132 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2133 SDNodeFlags Flags;
2134 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2135 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2136 Ext->setFlags(Flags);
2137 return Ext;
2138 }
2139
2140 // Constant index will be matched by tablegen.
2141 if (isa<ConstantSDNode>(Index.getNode()))
2142 return Op;
2143
2144 // Extract individual elements and select one of them.
2145 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2146 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2147 EVT EltVT = VectorVT.getVectorElementType();
2148
2149 SDLoc dl(Op.getNode());
2150 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2151 DAG.getIntPtrConstant(0, dl));
2152 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2153 DAG.getIntPtrConstant(1, dl));
2154 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2156}
2157
2158SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2159 SelectionDAG &DAG) const {
2160 SDValue Vector = Op->getOperand(0);
2161 EVT VectorVT = Vector.getValueType();
2162
2163 if (VectorVT != MVT::v4i8)
2164 return Op;
2165 SDLoc DL(Op);
2166 SDValue Value = Op->getOperand(1);
2167 if (Value->isUndef())
2168 return Vector;
2169
2170 SDValue Index = Op->getOperand(2);
2171
2172 SDValue BFI =
2173 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2174 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2175 DAG.getNode(ISD::MUL, DL, MVT::i32,
2176 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2177 DAG.getConstant(8, DL, MVT::i32)),
2178 DAG.getConstant(8, DL, MVT::i32)});
2179 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2180}
2181
2182SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2183 SelectionDAG &DAG) const {
2184 SDValue V1 = Op.getOperand(0);
2185 EVT VectorVT = V1.getValueType();
2186 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2187 return Op;
2188
2189 // Lower shuffle to PRMT instruction.
2190 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2191 SDValue V2 = Op.getOperand(1);
2192 uint32_t Selector = 0;
2193 for (auto I : llvm::enumerate(SVN->getMask())) {
2194 if (I.value() != -1) // -1 is a placeholder for undef.
2195 Selector |= (I.value() << (I.index() * 4));
2196 }
2197
2198 SDLoc DL(Op);
2199 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2200 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2201 return DAG.getBitcast(Op.getValueType(), PRMT);
2202}
2203/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2204/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2205/// amount, or
2206/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2207/// amount.
2208SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2209 SelectionDAG &DAG) const {
2210 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2211 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2212
2213 EVT VT = Op.getValueType();
2214 unsigned VTBits = VT.getSizeInBits();
2215 SDLoc dl(Op);
2216 SDValue ShOpLo = Op.getOperand(0);
2217 SDValue ShOpHi = Op.getOperand(1);
2218 SDValue ShAmt = Op.getOperand(2);
2219 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2220
2221 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2222 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2223 // {dHi, dLo} = {aHi, aLo} >> Amt
2224 // dHi = aHi >> Amt
2225 // dLo = shf.r.clamp aLo, aHi, Amt
2226
2227 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2228 SDValue Lo =
2229 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2230
2231 SDValue Ops[2] = { Lo, Hi };
2232 return DAG.getMergeValues(Ops, dl);
2233 }
2234 else {
2235 // {dHi, dLo} = {aHi, aLo} >> Amt
2236 // - if (Amt>=size) then
2237 // dLo = aHi >> (Amt-size)
2238 // dHi = aHi >> Amt (this is either all 0 or all 1)
2239 // else
2240 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2241 // dHi = aHi >> Amt
2242
2243 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2244 DAG.getConstant(VTBits, dl, MVT::i32),
2245 ShAmt);
2246 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2247 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2248 DAG.getConstant(VTBits, dl, MVT::i32));
2249 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2250 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2251 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2252
2253 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2254 DAG.getConstant(VTBits, dl, MVT::i32),
2255 ISD::SETGE);
2256 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2257 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2258
2259 SDValue Ops[2] = { Lo, Hi };
2260 return DAG.getMergeValues(Ops, dl);
2261 }
2262}
2263
2264/// LowerShiftLeftParts - Lower SHL_PARTS, which
2265/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2266/// amount, or
2267/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2268/// amount.
2269SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2270 SelectionDAG &DAG) const {
2271 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2272 assert(Op.getOpcode() == ISD::SHL_PARTS);
2273
2274 EVT VT = Op.getValueType();
2275 unsigned VTBits = VT.getSizeInBits();
2276 SDLoc dl(Op);
2277 SDValue ShOpLo = Op.getOperand(0);
2278 SDValue ShOpHi = Op.getOperand(1);
2279 SDValue ShAmt = Op.getOperand(2);
2280
2281 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2282 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2283 // {dHi, dLo} = {aHi, aLo} << Amt
2284 // dHi = shf.l.clamp aLo, aHi, Amt
2285 // dLo = aLo << Amt
2286
2287 SDValue Hi =
2288 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2289 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2290
2291 SDValue Ops[2] = { Lo, Hi };
2292 return DAG.getMergeValues(Ops, dl);
2293 }
2294 else {
2295 // {dHi, dLo} = {aHi, aLo} << Amt
2296 // - if (Amt>=size) then
2297 // dLo = aLo << Amt (all 0)
2298 // dLo = aLo << (Amt-size)
2299 // else
2300 // dLo = aLo << Amt
2301 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2302
2303 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2304 DAG.getConstant(VTBits, dl, MVT::i32),
2305 ShAmt);
2306 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2307 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2308 DAG.getConstant(VTBits, dl, MVT::i32));
2309 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2310 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2311 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2312
2313 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2314 DAG.getConstant(VTBits, dl, MVT::i32),
2315 ISD::SETGE);
2316 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2317 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2318
2319 SDValue Ops[2] = { Lo, Hi };
2320 return DAG.getMergeValues(Ops, dl);
2321 }
2322}
2323
2324/// If the types match, convert the generic copysign to the NVPTXISD version,
2325 /// otherwise bail, ensuring that mismatched cases are properly expanded.
2326SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2327 SelectionDAG &DAG) const {
2328 EVT VT = Op.getValueType();
2329 SDLoc DL(Op);
2330
2331 SDValue In1 = Op.getOperand(0);
2332 SDValue In2 = Op.getOperand(1);
2333 EVT SrcVT = In2.getValueType();
2334
2335 if (!SrcVT.bitsEq(VT))
2336 return SDValue();
2337
2338 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2339}
2340
2341SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2342 EVT VT = Op.getValueType();
2343
2344 if (VT == MVT::f32)
2345 return LowerFROUND32(Op, DAG);
2346
2347 if (VT == MVT::f64)
2348 return LowerFROUND64(Op, DAG);
2349
2350 llvm_unreachable("unhandled type");
2351}
2352
2353 // This is the rounding method used in CUDA's libdevice, in C-like code:
2354// float roundf(float A)
2355// {
2356// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2357// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2358// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2359// }
2360SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2361 SelectionDAG &DAG) const {
2362 SDLoc SL(Op);
2363 SDValue A = Op.getOperand(0);
2364 EVT VT = Op.getValueType();
2365
2366 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2367
2368 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2369 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2370 const unsigned SignBitMask = 0x80000000;
2371 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2372 DAG.getConstant(SignBitMask, SL, MVT::i32));
2373 const unsigned PointFiveInBits = 0x3F000000;
2374 SDValue PointFiveWithSignRaw =
2375 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2376 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2377 SDValue PointFiveWithSign =
2378 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2379 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2380 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2381
2382 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2383 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2384 SDValue IsLarge =
2385 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2386 ISD::SETOGT);
2387 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2388
2389 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2390 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2391 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2392 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2393 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2394}
2395
2396// The implementation of round(double) is similar to that of round(float) in
2397// that they both separate the value range into three regions and use a method
2398// specific to the region to round the values. However, round(double) first
2399// calculates the round of the absolute value and then adds the sign back while
2400// round(float) directly rounds the value with sign.
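// In C-like code (a sketch mirroring the lowering below, not taken verbatim
// from libdevice):
// double round(double A)
// {
// double RoundedA = trunc(fabs(A) + 0.5);
// RoundedA = fabs(A) < 0.5 ? 0.0 : RoundedA;
// RoundedA = copysign(RoundedA, A);
// return fabs(A) > 0x1.0p52 ? A : RoundedA;
// }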
2401SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2402 SelectionDAG &DAG) const {
2403 SDLoc SL(Op);
2404 SDValue A = Op.getOperand(0);
2405 EVT VT = Op.getValueType();
2406
2407 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2408
2409 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2410 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2411 DAG.getConstantFP(0.5, SL, VT));
2412 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2413
2414 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2415 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2416 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2417 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2418 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2419 DAG.getConstantFP(0, SL, VT),
2420 RoundedA);
2421
2422 // Add sign to rounded_A
2423 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2425
2426 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2427 SDValue IsLarge =
2428 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2429 ISD::SETOGT);
2430 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2431}
2432
2434 EVT VT = N->getValueType(0);
2435 EVT NVT = MVT::f32;
2436 if (VT.isVector()) {
2437 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2438 }
2439 SDLoc DL(N);
2440 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2441 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2442 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2443 return DAG.getFPExtendOrRound(Res, DL, VT);
2444}
2445
2446SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2447 SelectionDAG &DAG) const {
2448 if (useF32FTZ(DAG.getMachineFunction())) {
2449 return PromoteBinOpToF32(Op.getNode(), DAG);
2450 }
2451 return Op;
2452}
2453
2454SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2455 SelectionDAG &DAG) const {
2456 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2457
2458 if (Op.getValueType() == MVT::bf16) {
2459 SDLoc Loc(Op);
2460 return DAG.getNode(
2461 ISD::FP_ROUND, Loc, MVT::bf16,
2462 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2463 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2464 }
2465
2466 // Everything else is considered legal.
2467 return Op;
2468}
2469
2470SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2471 SelectionDAG &DAG) const {
2472 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2473
2474 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2475 SDLoc Loc(Op);
2476 return DAG.getNode(
2477 Op.getOpcode(), Loc, Op.getValueType(),
2478 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2479 }
2480
2481 // Everything else is considered legal.
2482 return Op;
2483}
2484
2485SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2486 SelectionDAG &DAG) const {
2487 EVT NarrowVT = Op.getValueType();
2488 SDValue Wide = Op.getOperand(0);
2489 EVT WideVT = Wide.getValueType();
2490 if (NarrowVT.getScalarType() == MVT::bf16) {
2491 const TargetLowering *TLI = STI.getTargetLowering();
2492 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2493 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2494 }
2495 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2496 // This combination was the first to support f32 -> bf16.
2497 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2498 if (WideVT.getScalarType() == MVT::f32) {
2499 return Op;
2500 }
2501 if (WideVT.getScalarType() == MVT::f64) {
2502 SDLoc Loc(Op);
2503 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2504 // the hardware f32 -> bf16 instruction.
2506 WideVT.changeElementType(*DAG.getContext(), MVT::f32), Wide, Loc,
2507 DAG);
2508 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2509 }
2510 }
2511 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2512 }
2513 }
2514
2515 // Everything else is considered legal.
2516 return Op;
2517}
2518
2519SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2520 SelectionDAG &DAG) const {
2521 SDValue Narrow = Op.getOperand(0);
2522 EVT NarrowVT = Narrow.getValueType();
2523 EVT WideVT = Op.getValueType();
2524 if (NarrowVT.getScalarType() == MVT::bf16) {
2525 if (WideVT.getScalarType() == MVT::f32 &&
2526 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2527 SDLoc Loc(Op);
2528 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2529 }
2530 if (WideVT.getScalarType() == MVT::f64 &&
2531 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2532 EVT F32 = NarrowVT.changeElementType(*DAG.getContext(), MVT::f32);
2533 SDLoc Loc(Op);
2534 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2535 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2536 } else {
2537 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2538 }
2539 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2540 }
2541 }
2542
2543 // Everything else is considered legal.
2544 return Op;
2545}
2546
2548 SDLoc DL(Op);
2549 if (Op.getValueType() != MVT::v2i16)
2550 return Op;
2551 EVT EltVT = Op.getValueType().getVectorElementType();
2552 SmallVector<SDValue> VecElements;
2553 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2554 SmallVector<SDValue> ScalarArgs;
2555 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2556 [&](const SDUse &O) {
2557 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2558 O.get(), DAG.getIntPtrConstant(I, DL));
2559 });
2560 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2561 }
2562 SDValue V =
2563 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2564 return V;
2565}
2566
2568 SDNode *N = Op.getNode();
2569 SDLoc DL(N);
2571
2572 // split the vector argument
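// e.g. a v4i32 operand becomes four scalar i32 operands on the new node.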
2573 for (size_t I = 0; I < N->getNumOperands(); I++) {
2574 SDValue Val = N->getOperand(I);
2575 EVT ValVT = Val.getValueType();
2576 if (ValVT.isVector()) {
2577 EVT EltVT = ValVT.getVectorElementType();
2578 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2579 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2580 DAG.getIntPtrConstant(J, DL)));
2581 } else
2582 Ops.push_back(Val);
2583 }
2584
2586 SDValue Tcgen05StNode =
2587 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2588 MemSD->getMemoryVT(), MemSD->getMemOperand());
2589
2590 return Tcgen05StNode;
2591}
2592
2594 SDLoc DL(Op);
2595 SDValue Src = Op.getOperand(0);
2596 EVT VT = Op.getValueType();
2597
2598 switch (VT.getSimpleVT().SimpleTy) {
2599 case MVT::i16: {
2600 SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
2601 SDValue Swapped =
2602 getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
2603 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
2604 }
2605 case MVT::i32: {
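// PRMT selector 0x0123 reverses the four bytes: result byte N takes source
// byte (3 - N), e.g. 0x11223344 -> 0x44332211.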
2606 return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
2607 }
2608 case MVT::v2i16: {
2609 SDValue Converted = DAG.getBitcast(MVT::i32, Src);
2610 SDValue Swapped =
2611 getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
2612 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
2613 }
2614 case MVT::i64: {
2615 SDValue UnpackSrc =
2616 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
2617 SDValue SwappedLow =
2618 getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2619 DL, DAG);
2620 SDValue SwappedHigh =
2621 getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2622 DL, DAG);
2623 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
2624 {SwappedHigh, SwappedLow});
2625 }
2626 default:
2627 llvm_unreachable("unsupported type for bswap");
2628 }
2629}
2630
2631static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2632 switch (IID) {
2633 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2634 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
2635 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2636 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
2637 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2638 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2639 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2640 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2641 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2642 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2643 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2644 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2645 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2646 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2647 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2648 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2649 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2650 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2651 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2652 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2653 case Intrinsic::
2654 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2655 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2656 case Intrinsic::
2657 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2658 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2659 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2660 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
2661 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2662 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
2663 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2664 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2665 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2666 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2667 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2668 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2669 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2670 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2671 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2672 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2673 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2674 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2675 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2676 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2677 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2678 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2679 case Intrinsic::
2680 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2681 return NVPTXISD::
2682 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2683 case Intrinsic::
2684 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2685 return NVPTXISD::
2686 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2687 };
2688 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2689}
2690
2692 SDNode *N = Op.getNode();
2693 SDLoc DL(N);
2694 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2695
2697 // split the vector argument
2698 for (size_t I = 0; I < N->getNumOperands(); I++) {
2699 if (I == 1)
2700 continue; // skip IID
2701 SDValue Val = N->getOperand(I);
2702 EVT ValVT = Val.getValueType();
2703 if (ValVT.isVector()) {
2704 EVT EltVT = ValVT.getVectorElementType();
2705 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2706 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2707 DAG.getIntPtrConstant(J, DL)));
2708 } else
2709 Ops.push_back(Val);
2710 }
2711
2713 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2714 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2715 MemSD->getMemoryVT(), MemSD->getMemOperand());
2716
2717 return Tcgen05MMANode;
2718}
2719
2720// Lower vector return type of tcgen05.ld intrinsics
2721static std::optional<std::pair<SDValue, SDValue>>
2722lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2723 SDLoc DL(N);
2724 EVT ResVT = N->getValueType(0);
2725 if (!ResVT.isVector())
2726 return {}; // already legalized.
2727
2728 const unsigned NumElts = ResVT.getVectorNumElements();
2729
2730 // Create the return type of the instructions
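// e.g. a v4i32 result is legalized to {i32, i32, i32, i32} plus the chain.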
2731 SmallVector<EVT, 5> ListVTs;
2732 for (unsigned i = 0; i < NumElts; ++i)
2733 ListVTs.push_back(MVT::i32);
2734
2735 ListVTs.push_back(N->getValueType(1)); // Chain
2736
2737 SDVTList ResVTs = DAG.getVTList(ListVTs);
2738
2739 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2740 N->getOperand(2)};
2741
2742 if (HasOffset) {
2743 Ops.push_back(N->getOperand(3)); // offset
2744 Ops.push_back(N->getOperand(4)); // Pack flag
2745 } else
2746 Ops.push_back(N->getOperand(3)); // Pack flag
2747
2749 SDValue NewNode =
2751 MemSD->getMemoryVT(), MemSD->getMemOperand());
2752
2753 // split the vector result
2754 SmallVector<SDValue, 4> ScalarRes;
2755 for (unsigned i = 0; i < NumElts; ++i) {
2756 SDValue Res = NewNode.getValue(i);
2757 ScalarRes.push_back(Res);
2758 }
2759
2760 SDValue Chain = NewNode.getValue(NumElts);
2761 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2762 return {{BuildVector, Chain}};
2763}
2764
2766 unsigned Val) {
2767 SDNode *N = Op.getNode();
2768 SDLoc DL(N);
2769
2770 const Function &Fn = DAG.getMachineFunction().getFunction();
2771
2772 unsigned AS = 0;
2773 if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N))
2774 AS = MemN->getAddressSpace();
2775 Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
2777
2779 Fn,
2780 "Intrinsic " +
2781 Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
2782 " with value " + Twine(Val) +
2783 " is not supported on the given target.",
2784 DL.getDebugLoc()));
2785 return Op.getOperand(0);
2786}
2787
2789 SDNode *N = Op.getNode();
2790 SDLoc DL(N);
2791
2792 // immediate argument representing elemtype
2793 unsigned Val = N->getConstantOperandVal(3);
2794
2796 Val))
2797 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2798
2799 return Op;
2800}
2801
2803 SDNode *N = Op.getNode();
2804 SDLoc DL(N);
2805
2806 // immediate argument representing swizzle mode
2807 unsigned Val = N->getConstantOperandVal(3);
2808
2810 Val))
2811 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2812
2813 return Op;
2814}
2815
2817 SDNode *N = Op.getNode();
2818 SDValue Intrin = N->getOperand(1);
2819
2820 // Get the intrinsic ID
2821 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2822 switch (IntrinNo) {
2823 default:
2824 break;
2825 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2826 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2827 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2828 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2829 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2830 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2831 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2832 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2833 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2834 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2835 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2836 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2837 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2838 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2839 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2840 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2841 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2842 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2843 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2844 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2845 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2846 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2847 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2848 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2849 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2850 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2851 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2852 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2853 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2854 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2855 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2856 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2857 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2858 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2859 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2860 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2861 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2862 return lowerTcgen05St(Op, DAG);
2863 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2864 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2865 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2866 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2867 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2868 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2869 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2870 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2871 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2872 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2873 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2874 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2875 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2876 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2877 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2878 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2879 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2880 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2881 case Intrinsic::
2882 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2883 case Intrinsic::
2884 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2885 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2886 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2887 case Intrinsic::
2888 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2889 case Intrinsic::
2890 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2892 case Intrinsic::nvvm_tensormap_replace_elemtype:
2893 return lowerTensormapReplaceElemtype(Op, DAG);
2894 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
2896 }
2897 return Op;
2898}
2899
2901 SelectionDAG &DAG) {
2902
2903 SDNode *N = Op.getNode();
2904 if (N->getOperand(1).getValueType() != MVT::i128) {
2905 // Return if the operand is already lowered.
2906 return SDValue();
2907 }
2908
2909 unsigned IID =
2910 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2911 auto Opcode = [&]() {
2912 switch (IID) {
2913 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2914 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED;
2915 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2916 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X;
2917 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2918 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y;
2919 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2920 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z;
2921 default:
2922 llvm_unreachable("unsupported/unhandled intrinsic");
2923 }
2924 }();
2925
2926 SDLoc DL(N);
2927 SDValue TryCancelResponse = N->getOperand(1);
2928 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2929 SDValue TryCancelResponse0 =
2930 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2931 DAG.getIntPtrConstant(0, DL));
2932 SDValue TryCancelResponse1 =
2933 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2934 DAG.getIntPtrConstant(1, DL));
2935
2936 return DAG.getNode(Opcode, DL, N->getVTList(),
2937 {TryCancelResponse0, TryCancelResponse1});
2938}
2939
2941 SDNode *N = Op.getNode();
2942 SDLoc DL(N);
2943 SDValue F32Vec = N->getOperand(1);
2944 SDValue RBits = N->getOperand(2);
2945
2946 unsigned IntrinsicID = N->getConstantOperandVal(0);
2947
2948 // Extract the 4 float elements from the vector
2950 for (unsigned i = 0; i < 4; ++i)
2951 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
2952 DAG.getIntPtrConstant(i, DL)));
2953
2955
2956 auto [OpCode, RetTy, CvtModeFlag] =
2957 [&]() -> std::tuple<unsigned, MVT::SimpleValueType, uint32_t> {
2958 switch (IntrinsicID) {
2959 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
2960 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
2961 CvtMode::RS | CvtMode::RELU_FLAG};
2962 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
2963 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2964 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
2965 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
2966 CvtMode::RS | CvtMode::RELU_FLAG};
2967 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
2968 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2969 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
2970 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
2971 CvtMode::RS | CvtMode::RELU_FLAG};
2972 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
2973 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2974 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
2975 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
2976 CvtMode::RS | CvtMode::RELU_FLAG};
2977 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
2978 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2979 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
2980 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
2981 CvtMode::RS | CvtMode::RELU_FLAG};
2982 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
2983 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
2984 default:
2985 llvm_unreachable("unsupported/unhandled intrinsic");
2986 }
2987 }();
2988
2989 Ops.push_back(RBits);
2990 Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
2991
2992 return DAG.getNode(OpCode, DL, RetTy, Ops);
2993}
2994
2996 const unsigned Mode = [&]() {
2997 switch (Op->getConstantOperandVal(0)) {
2998 case Intrinsic::nvvm_prmt:
2999 return NVPTX::PTXPrmtMode::NONE;
3000 case Intrinsic::nvvm_prmt_b4e:
3001 return NVPTX::PTXPrmtMode::B4E;
3002 case Intrinsic::nvvm_prmt_ecl:
3003 return NVPTX::PTXPrmtMode::ECL;
3004 case Intrinsic::nvvm_prmt_ecr:
3005 return NVPTX::PTXPrmtMode::ECR;
3006 case Intrinsic::nvvm_prmt_f4e:
3007 return NVPTX::PTXPrmtMode::F4E;
3008 case Intrinsic::nvvm_prmt_rc16:
3009 return NVPTX::PTXPrmtMode::RC16;
3010 case Intrinsic::nvvm_prmt_rc8:
3011 return NVPTX::PTXPrmtMode::RC8;
3012 default:
3013 llvm_unreachable("unsupported/unhandled intrinsic");
3014 }
3015 }();
3016 SDLoc DL(Op);
3017 SDValue A = Op->getOperand(1);
3018 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
3019 : DAG.getConstant(0, DL, MVT::i32);
3020 SDValue Selector = (Op->op_end() - 1)->get();
3021 return getPRMT(A, B, Selector, DL, DAG, Mode);
3022}
3023
3024#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE) \
3025 Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_x##NUM##_##TYPE
3026
3027#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE) \
3028 NVPTXISD::TCGEN05_LD_RED_##SHAPE##_X##NUM##_##TYPE
3029
3030static unsigned getTcgen05LdRedID(Intrinsic::ID IID) {
3031 switch (IID) {
3032 case TCGEN05_LD_RED_INTR(32x32b, 2, f32):
3033 return TCGEN05_LD_RED_INST(32x32b, 2, F32);
3034 case TCGEN05_LD_RED_INTR(32x32b, 4, f32):
3035 return TCGEN05_LD_RED_INST(32x32b, 4, F32);
3036 case TCGEN05_LD_RED_INTR(32x32b, 8, f32):
3037 return TCGEN05_LD_RED_INST(32x32b, 8, F32);
3038 case TCGEN05_LD_RED_INTR(32x32b, 16, f32):
3039 return TCGEN05_LD_RED_INST(32x32b, 16, F32);
3040 case TCGEN05_LD_RED_INTR(32x32b, 32, f32):
3041 return TCGEN05_LD_RED_INST(32x32b, 32, F32);
3042 case TCGEN05_LD_RED_INTR(32x32b, 64, f32):
3043 return TCGEN05_LD_RED_INST(32x32b, 64, F32);
3044 case TCGEN05_LD_RED_INTR(32x32b, 128, f32):
3045 return TCGEN05_LD_RED_INST(32x32b, 128, F32);
3046 case TCGEN05_LD_RED_INTR(16x32bx2, 2, f32):
3047 return TCGEN05_LD_RED_INST(16x32bx2, 2, F32);
3048 case TCGEN05_LD_RED_INTR(16x32bx2, 4, f32):
3049 return TCGEN05_LD_RED_INST(16x32bx2, 4, F32);
3050 case TCGEN05_LD_RED_INTR(16x32bx2, 8, f32):
3051 return TCGEN05_LD_RED_INST(16x32bx2, 8, F32);
3052 case TCGEN05_LD_RED_INTR(16x32bx2, 16, f32):
3053 return TCGEN05_LD_RED_INST(16x32bx2, 16, F32);
3054 case TCGEN05_LD_RED_INTR(16x32bx2, 32, f32):
3055 return TCGEN05_LD_RED_INST(16x32bx2, 32, F32);
3056 case TCGEN05_LD_RED_INTR(16x32bx2, 64, f32):
3057 return TCGEN05_LD_RED_INST(16x32bx2, 64, F32);
3058 case TCGEN05_LD_RED_INTR(16x32bx2, 128, f32):
3059 return TCGEN05_LD_RED_INST(16x32bx2, 128, F32);
3060 case TCGEN05_LD_RED_INTR(32x32b, 2, i32):
3061 return TCGEN05_LD_RED_INST(32x32b, 2, I32);
3062 case TCGEN05_LD_RED_INTR(32x32b, 4, i32):
3063 return TCGEN05_LD_RED_INST(32x32b, 4, I32);
3064 case TCGEN05_LD_RED_INTR(32x32b, 8, i32):
3065 return TCGEN05_LD_RED_INST(32x32b, 8, I32);
3066 case TCGEN05_LD_RED_INTR(32x32b, 16, i32):
3067 return TCGEN05_LD_RED_INST(32x32b, 16, I32);
3068 case TCGEN05_LD_RED_INTR(32x32b, 32, i32):
3069 return TCGEN05_LD_RED_INST(32x32b, 32, I32);
3070 case TCGEN05_LD_RED_INTR(32x32b, 64, i32):
3071 return TCGEN05_LD_RED_INST(32x32b, 64, I32);
3072 case TCGEN05_LD_RED_INTR(32x32b, 128, i32):
3073 return TCGEN05_LD_RED_INST(32x32b, 128, I32);
3074 case TCGEN05_LD_RED_INTR(16x32bx2, 2, i32):
3075 return TCGEN05_LD_RED_INST(16x32bx2, 2, I32);
3076 case TCGEN05_LD_RED_INTR(16x32bx2, 4, i32):
3077 return TCGEN05_LD_RED_INST(16x32bx2, 4, I32);
3078 case TCGEN05_LD_RED_INTR(16x32bx2, 8, i32):
3079 return TCGEN05_LD_RED_INST(16x32bx2, 8, I32);
3080 case TCGEN05_LD_RED_INTR(16x32bx2, 16, i32):
3081 return TCGEN05_LD_RED_INST(16x32bx2, 16, I32);
3082 case TCGEN05_LD_RED_INTR(16x32bx2, 32, i32):
3083 return TCGEN05_LD_RED_INST(16x32bx2, 32, I32);
3084 case TCGEN05_LD_RED_INTR(16x32bx2, 64, i32):
3085 return TCGEN05_LD_RED_INST(16x32bx2, 64, I32);
3086 case TCGEN05_LD_RED_INTR(16x32bx2, 128, i32):
3087 return TCGEN05_LD_RED_INST(16x32bx2, 128, I32);
3088 default:
3089 llvm_unreachable("Invalid tcgen05.ld.red intrinsic ID");
3090 }
3091}
3092
3093 // Lower vector return type of tcgen05.ld.red intrinsics
3094static std::optional<std::tuple<SDValue, SDValue, SDValue>>
3096 SDLoc DL(N);
3097 EVT ResVT = N->getValueType(0);
3098 if (!ResVT.isVector())
3099 return {}; // already legalized.
3100
3101 const unsigned NumElts = ResVT.getVectorNumElements();
3102
3103 // Create the return type of the instructions
3104 // +1 represents the reduction value
3105 SmallVector<EVT, 132> ListVTs{
3106 NumElts + 1,
3107 ResVT.getVectorElementType().isFloatingPoint() ? MVT::f32 : MVT::i32};
3108
3109 ListVTs.push_back(MVT::Other); // Chain
3110
3111 SDVTList ResVTs = DAG.getVTList(ListVTs);
3112
3113 // Prepare the Operands
3114 SmallVector<SDValue, 8> Ops{N->getOperand(0)}; // Chain
3115
3116 // skip IID at index 1
3117 for (unsigned i = 2; i < N->getNumOperands(); i++)
3118 Ops.push_back(N->getOperand(i));
3119
3120 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
3122 SDValue NewNode =
3123 DAG.getMemIntrinsicNode(getTcgen05LdRedID(IID), DL, ResVTs, Ops,
3124 MemSD->getMemoryVT(), MemSD->getMemOperand());
3125
3126 // Split vector result
3127 SmallVector<SDValue, 132> ScalarRes;
3128 for (unsigned i = 0; i < NumElts; ++i) {
3129 SDValue Res = NewNode.getValue(i);
3130 ScalarRes.push_back(Res);
3131 }
3132
3133 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
3134 SDValue RedResult = NewNode.getValue(NumElts);
3135 SDValue Chain = NewNode.getValue(NumElts + 1);
3136 return {{BuildVector, RedResult, Chain}};
3137}
3138
3140 switch (Op->getConstantOperandVal(1)) {
3141 default:
3142 return Op;
3143
3144 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
3145 // lower them through LowerOperation() instead of ReplaceNodeResults().
3146 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
3147 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
3148 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
3149 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
3150 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3151 return SDValue();
3152
3153 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
3154 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
3155 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3156 return SDValue();
3157
3158 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
3159 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
3160 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32:
3161 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32:
3162 if (auto Res = lowerTcgen05LdRed(Op.getNode(), DAG))
3163 return DAG.getMergeValues(
3164 {std::get<0>(*Res), std::get<1>(*Res), std::get<2>(*Res)}, SDLoc(Op));
3165 return SDValue();
3166 }
3167}
3168
3170 switch (Op->getConstantOperandVal(0)) {
3171 default:
3172 return Op;
3173 case Intrinsic::nvvm_prmt:
3174 case Intrinsic::nvvm_prmt_b4e:
3175 case Intrinsic::nvvm_prmt_ecl:
3176 case Intrinsic::nvvm_prmt_ecr:
3177 case Intrinsic::nvvm_prmt_f4e:
3178 case Intrinsic::nvvm_prmt_rc16:
3179 case Intrinsic::nvvm_prmt_rc8:
3180 return lowerPrmtIntrinsic(Op, DAG);
3181 case Intrinsic::nvvm_internal_addrspace_wrap:
3182 return Op.getOperand(1);
3183 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
3184 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
3185 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
3186 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
3188 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
3189 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
3190 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
3191 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
3192 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
3193 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
3194 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
3195 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
3196 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
3197 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
3198 return lowerCvtRSIntrinsics(Op, DAG);
3199 }
3200}
3201
3202// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
3203 // Lower these into the equivalent 32-bit node and zero-extend the result
3204 // back to 64 bits.
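// For example, a 64-bit ctlz roughly corresponds to the PTX sequence
// "clz.b64 %r32, %rd64; cvt.u64.u32 %rd, %r32".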
3206 SDValue V = Op->getOperand(0);
3207 assert(V.getValueType() == MVT::i64 &&
3208 "Unexpected CTLZ/CTPOP type to legalize");
3209
3210 SDLoc DL(Op);
3211 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
3212 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
3213}
3214
3216 unsigned Opcode, SelectionDAG &DAG) {
3217 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
3218
3219 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
3220 if (!AmtConst)
3221 return SDValue();
3222 const auto Amt = AmtConst->getZExtValue() & 63;
3223
3224 SDValue UnpackA =
3225 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3226 SDValue UnpackB =
3227 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3228
3229 // The architecture is little-endian: 0 = low bits, 1 = high bits
3230 SDValue ALo = UnpackA.getValue(0);
3231 SDValue AHi = UnpackA.getValue(1);
3232 SDValue BLo = UnpackB.getValue(0);
3233 SDValue BHi = UnpackB.getValue(1);
3234
3235 // The bitfield consists of { AHi : ALo : BHi : BLo }
3236 //
3237 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3238 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3239 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3240 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3241 //
3242 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3243 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3244 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3245 // move to select and arrange the 32-bit values. For simplicity, these cases
3246 // are not handled here explicitly and instead we rely on DAGCombiner to
3247 // remove the no-op funnel shifts we insert.
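// For example, FSHL with Amt == 8 selects {AHi, ALo, BHi} and produces
// RHi = fshl32(AHi, ALo, 8) and RLo = fshl32(ALo, BHi, 8).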
3248 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3249 ? std::make_tuple(AHi, ALo, BHi)
3250 : std::make_tuple(ALo, BHi, BLo);
3251
3252 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3253 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3254 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3255
3256 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3257}
3258
3260 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3261 SDLoc(Op), Op->getOpcode(), DAG);
3262}
3263
3265 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3266 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3267 SDLoc(Op), Opcode, DAG);
3268}
3269
3271 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3272 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3273 // the semantics of LLVM's frem.
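// For example, frem(5.5, 2.0) = 5.5 - trunc(5.5 / 2.0) * 2.0 = 5.5 - 4.0 = 1.5.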
3274 SDLoc DL(Op);
3275 SDValue X = Op->getOperand(0);
3276 SDValue Y = Op->getOperand(1);
3277 EVT Ty = Op.getValueType();
3278 SDNodeFlags Flags = Op->getFlags();
3279
3280 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3281 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3282 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3284 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3286
3287 if (Flags.hasNoInfs())
3288 return Sub;
3289
3290 // If Y is infinite, return X
3291 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3292 SDValue Inf =
3293 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3294 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3295 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3296}
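// A small worked example of the expansion above (illustration only):
//
//   frem(5.5, 2.0) = 5.5 - trunc(5.5 / 2.0) * 2.0
//                  = 5.5 - 2.0 * 2.0
//                  = 1.5
//
// and with an infinite divisor, e.g. frem(5.5, +inf), the final select
// returns X (5.5), matching LLVM's frem semantics.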
3297
3299 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3300
3301 SDValue Cond = Op->getOperand(0);
3302 SDValue TrueVal = Op->getOperand(1);
3303 SDValue FalseVal = Op->getOperand(2);
3304 SDLoc DL(Op);
3305
3306 // If both operands are truncated, we push the select through the truncates.
3307 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3308 FalseVal.getOpcode() == ISD::TRUNCATE) {
3309 TrueVal = TrueVal.getOperand(0);
3310 FalseVal = FalseVal.getOperand(0);
3311
3312 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3313 ? TrueVal.getValueType()
3314 : FalseVal.getValueType();
3315 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3316 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3317 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3318 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3319 }
3320
3321 // Otherwise, expand the select into a series of logical operations. These
3322 // often can be folded into other operations either by us or ptxas.
3323 TrueVal = DAG.getFreeze(TrueVal);
3324 FalseVal = DAG.getFreeze(FalseVal);
3325 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3326 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3327 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3328 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3329 return Or;
3330}
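// Sketch of the logical expansion above (illustration only): an i1 select
//
//   %r = select i1 %c, i1 %t, i1 %f
//
// becomes (%c & %t) | (~%c & %f) after freezing %t and %f. When %c is 1 the
// first conjunction yields %t and the second is 0; when %c is 0 the first is
// 0 and the second yields %f.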
3331
3333 SDNode *N = Op.getNode();
3334
3335 SDValue Chain = N->getOperand(0);
3336 SDValue Val = N->getOperand(1);
3337 SDValue BasePtr = N->getOperand(2);
3338 SDValue Offset = N->getOperand(3);
3339 SDValue Mask = N->getOperand(4);
3340
3341 SDLoc DL(N);
3342 EVT ValVT = Val.getValueType();
3343 MemSDNode *MemSD = cast<MemSDNode>(N);
3344 assert(ValVT.isVector() && "Masked vector store must have vector type");
3345 assert(MemSD->getAlign() >= DAG.getEVTAlign(ValVT) &&
3346 "Unexpected alignment for masked store");
3347
3348 unsigned Opcode = 0;
3349 switch (ValVT.getSimpleVT().SimpleTy) {
3350 default:
3351 llvm_unreachable("Unexpected masked vector store type");
3352 case MVT::v4i64:
3353 case MVT::v4f64: {
3354 Opcode = NVPTXISD::StoreV4;
3355 break;
3356 }
3357 case MVT::v8i32:
3358 case MVT::v8f32: {
3359 Opcode = NVPTXISD::StoreV8;
3360 break;
3361 }
3362 }
3363
3365
3366 // Construct the new SDNode. First operand is the chain.
3367 Ops.push_back(Chain);
3368
3369 // The next N operands are the values to store. Encode the mask into the
3370 // values using the sentinel register 0 to represent a masked-off element.
3371 assert(Mask.getValueType().isVector() &&
3372 Mask.getValueType().getVectorElementType() == MVT::i1 &&
3373 "Mask must be a vector of i1");
3374 assert(Mask.getOpcode() == ISD::BUILD_VECTOR &&
3375 "Mask expected to be a BUILD_VECTOR");
3376 assert(Mask.getValueType().getVectorNumElements() ==
3377 ValVT.getVectorNumElements() &&
3378 "Mask size must be the same as the vector size");
3379 for (auto [I, Op] : enumerate(Mask->ops())) {
3380 // Mask elements must be constants.
3381 if (Op.getNode()->getAsZExtVal() == 0) {
3382        // Append a sentinel register 0 to the Ops vector to represent a
3383        // masked-off element; this will be handled in TableGen.
3385 ValVT.getVectorElementType()));
3386 } else {
3387 // Extract the element from the vector to store
3388 SDValue ExtVal =
3390 Val, DAG.getIntPtrConstant(I, DL));
3391 Ops.push_back(ExtVal);
3392 }
3393 }
3394
3395 // Next, the pointer operand.
3396 Ops.push_back(BasePtr);
3397
3398 // Finally, the offset operand. We expect this to always be undef, and it will
3399 // be ignored in lowering, but to mirror the handling of the other vector
3400 // store instructions we include it in the new SDNode.
3401 assert(Offset.getOpcode() == ISD::UNDEF &&
3402 "Offset operand expected to be undef");
3403 Ops.push_back(Offset);
3404
3405 SDValue NewSt =
3406 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3407 MemSD->getMemoryVT(), MemSD->getMemOperand());
3408
3409 return NewSt;
3410}
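// Illustration of the operand encoding above (a sketch, not an exhaustive
// description): a masked store of <8 x float> with mask <1,1,0,0,1,1,1,1>
// becomes a StoreV8 whose operands are
//
//   { Chain, v0, v1, Reg0, Reg0, v4, v5, v6, v7, BasePtr, Offset }
//
// where Reg0 is the sentinel "register 0" standing in for the two masked-off
// elements.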
3411
3412SDValue
3413NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3414  switch (Op.getOpcode()) {
3415 case ISD::RETURNADDR:
3416 return SDValue();
3417 case ISD::FRAMEADDR:
3418 return SDValue();
3419 case ISD::ADDRSPACECAST:
3420 return LowerADDRSPACECAST(Op, DAG);
3422 return lowerIntrinsicWChain(Op, DAG);
3424 return lowerIntrinsicWOChain(Op, DAG);
3426 return lowerIntrinsicVoid(Op, DAG);
3427 case ISD::BUILD_VECTOR:
3428 return LowerBUILD_VECTOR(Op, DAG);
3429 case ISD::BITCAST:
3430 return LowerBITCAST(Op, DAG);
3432 return Op;
3434 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3436 return LowerINSERT_VECTOR_ELT(Op, DAG);
3438 return LowerVECTOR_SHUFFLE(Op, DAG);
3440 return LowerCONCAT_VECTORS(Op, DAG);
3445 return LowerVECREDUCE(Op, DAG);
3446 case ISD::STORE:
3447 return LowerSTORE(Op, DAG);
3448 case ISD::MSTORE: {
3449 assert(STI.has256BitVectorLoadStore(
3450 cast<MemSDNode>(Op.getNode())->getAddressSpace()) &&
3451 "Masked store vector not supported on subtarget.");
3452 return lowerMSTORE(Op, DAG);
3453 }
3454 case ISD::LOAD:
3455 return LowerLOAD(Op, DAG);
3456 case ISD::MLOAD:
3457 return LowerMLOAD(Op, DAG);
3458 case ISD::SHL_PARTS:
3459 return LowerShiftLeftParts(Op, DAG);
3460 case ISD::SRA_PARTS:
3461 case ISD::SRL_PARTS:
3462 return LowerShiftRightParts(Op, DAG);
3463 case ISD::SELECT:
3464 return lowerSELECT(Op, DAG);
3465 case ISD::FROUND:
3466 return LowerFROUND(Op, DAG);
3467 case ISD::FCOPYSIGN:
3468 return LowerFCOPYSIGN(Op, DAG);
3469 case ISD::SINT_TO_FP:
3470 case ISD::UINT_TO_FP:
3471 return LowerINT_TO_FP(Op, DAG);
3472 case ISD::FP_TO_SINT:
3473 case ISD::FP_TO_UINT:
3474 return LowerFP_TO_INT(Op, DAG);
3475 case ISD::FP_ROUND:
3476 return LowerFP_ROUND(Op, DAG);
3477 case ISD::FP_EXTEND:
3478 return LowerFP_EXTEND(Op, DAG);
3479 case ISD::VAARG:
3480 return LowerVAARG(Op, DAG);
3481 case ISD::VASTART:
3482 return LowerVASTART(Op, DAG);
3483 case ISD::FSHL:
3484 case ISD::FSHR:
3485 return lowerFSH(Op, DAG);
3486 case ISD::ROTL:
3487 case ISD::ROTR:
3488 return lowerROT(Op, DAG);
3489 case ISD::ABS:
3490 case ISD::SMIN:
3491 case ISD::SMAX:
3492 case ISD::UMIN:
3493 case ISD::UMAX:
3494 case ISD::ADD:
3495 case ISD::SUB:
3496 case ISD::MUL:
3497 case ISD::SHL:
3498 case ISD::SREM:
3499 case ISD::UREM:
3500 return LowerVectorArith(Op, DAG);
3502 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3503 case ISD::STACKRESTORE:
3504 return LowerSTACKRESTORE(Op, DAG);
3505 case ISD::STACKSAVE:
3506 return LowerSTACKSAVE(Op, DAG);
3507 case ISD::CopyToReg:
3508 return LowerCopyToReg_128(Op, DAG);
3509 case ISD::FADD:
3510 case ISD::FSUB:
3511 case ISD::FMUL:
3512 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3513 return PromoteBinOpIfF32FTZ(Op, DAG);
3514 case ISD::CTPOP:
3515 case ISD::CTLZ:
3516 return lowerCTLZCTPOP(Op, DAG);
3517 case ISD::FREM:
3518 return lowerFREM(Op, DAG);
3519 case ISD::BSWAP:
3520 return lowerBSWAP(Op, DAG);
3521 default:
3522 llvm_unreachable("Custom lowering not defined for operation");
3523 }
3524}
3525
3526// This will prevent AsmPrinter from trying to print the jump tables itself.
3530
3531SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3532 SelectionDAG &DAG) const {
3534 unsigned SrcAS = N->getSrcAddressSpace();
3535 unsigned DestAS = N->getDestAddressSpace();
3536 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3537 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3538 // Shared and SharedCluster can be converted to each other through generic
3539 // space
3540 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3543 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3544 SDLoc DL(Op.getNode());
3545      const MVT GenericVT =
3547      SDValue GenericConversion = DAG.getAddrSpaceCast(
3548          DL, GenericVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3549 SDValue SharedClusterConversion =
3550 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3551 ADDRESS_SPACE_GENERIC, DestAS);
3552 return SharedClusterConversion;
3553 }
3554
3555 return DAG.getUNDEF(Op.getValueType());
3556 }
3557
3558 return Op;
3559}
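// Illustration of the path above (assuming the usual NVPTX numbering where
// shared is addrspace(3) and shared::cluster is addrspace(7)): a cast such as
//
//   %q = addrspacecast ptr addrspace(3) %p to ptr addrspace(7)
//
// is emitted as two casts, shared -> generic followed by generic ->
// shared::cluster, since PTX has no direct conversion between the two
// specific spaces.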
3560
3561// This function is almost a copy of SelectionDAG::expandVAArg().
3562// The only diff is that this one produces loads from local address space.
3563SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3564 const TargetLowering *TLI = STI.getTargetLowering();
3565 SDLoc DL(Op);
3566
3567 SDNode *Node = Op.getNode();
3568 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3569 EVT VT = Node->getValueType(0);
3570 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3571 SDValue Tmp1 = Node->getOperand(0);
3572 SDValue Tmp2 = Node->getOperand(1);
3573 const MaybeAlign MA(Node->getConstantOperandVal(3));
3574
3575 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3576 Tmp1, Tmp2, MachinePointerInfo(V));
3577 SDValue VAList = VAListLoad;
3578
3579 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3580 VAList = DAG.getNode(
3581 ISD::ADD, DL, VAList.getValueType(), VAList,
3582 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3583
3584 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3585 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3586 VAList.getValueType()));
3587 }
3588
3589 // Increment the pointer, VAList, to the next vaarg
3590 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3592 DL, VAList.getValueType()));
3593
3594 // Store the incremented VAList to the legalized pointer
3595 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3596 MachinePointerInfo(V));
3597
3598 const Value *SrcV = Constant::getNullValue(
3600
3601 // Load the actual argument out of the pointer VAList
3602 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3603}
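// Worked example of the alignment step above (illustration only, assuming an
// argument alignment of 8 that exceeds the minimum stack argument alignment):
// a va_list pointer of 0x1004 is bumped to 0x1004 + 7 = 0x100B and masked
// with -8, giving 0x1008, the next 8-byte-aligned slot, before the argument
// is loaded from local space.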
3604
3605SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3606 const TargetLowering *TLI = STI.getTargetLowering();
3607 SDLoc DL(Op);
3608 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3609
3610 // Store the address of unsized array <function>_vararg[] in the ap object.
3611 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3612
3613 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3614 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3615 MachinePointerInfo(SV));
3616}
3617
3618static std::pair<MemSDNode *, uint32_t>
3620 const NVPTXSubtarget &STI) {
3621 SDValue Chain = N->getOperand(0);
3622 SDValue BasePtr = N->getOperand(1);
3623 SDValue Mask = N->getOperand(3);
3624 [[maybe_unused]] SDValue Passthru = N->getOperand(4);
3625
3626 SDLoc DL(N);
3627 EVT ResVT = N->getValueType(0);
3628 assert(ResVT.isVector() && "Masked vector load must have vector type");
3629 // While we only expect poison passthru vectors as an input to the backend,
3630 // when the legalization framework splits a poison vector in half, it creates
3631 // two undef vectors, so we can technically expect those too.
3632 assert((Passthru.getOpcode() == ISD::POISON ||
3633 Passthru.getOpcode() == ISD::UNDEF) &&
3634 "Passthru operand expected to be poison or undef");
3635
3636 // Extract the mask and convert it to a uint32_t representing the used bytes
3637 // of the entire vector load
3638 uint32_t UsedBytesMask = 0;
3639 uint32_t ElementSizeInBits = ResVT.getVectorElementType().getSizeInBits();
3640 assert(ElementSizeInBits % 8 == 0 && "Unexpected element size");
3641 uint32_t ElementSizeInBytes = ElementSizeInBits / 8;
3642 uint32_t ElementMask = (1u << ElementSizeInBytes) - 1u;
3643
3644 for (SDValue Op : reverse(Mask->ops())) {
3645    // This shift is only needed for every iteration but the first; however,
3646    // in the first iteration UsedBytesMask is 0, so the shift there is a
3647    // no-op.
3648 UsedBytesMask <<= ElementSizeInBytes;
3649
3650 // Mask elements must be constants.
3651 if (Op->getAsZExtVal() != 0)
3652 UsedBytesMask |= ElementMask;
3653 }
3654
3655 assert(UsedBytesMask != 0 && UsedBytesMask != UINT32_MAX &&
3656 "Unexpected masked load with elements masked all on or all off");
3657
3658  // Create a new load SDNode to be handled normally by replaceLoadVector.
3659 MemSDNode *NewLD = cast<MemSDNode>(
3660 DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());
3661
3662 // If our subtarget does not support the used bytes mask pragma, "drop" the
3663 // mask by setting it to UINT32_MAX
3664 if (!STI.hasUsedBytesMaskPragma())
3665 UsedBytesMask = UINT32_MAX;
3666
3667 return {NewLD, UsedBytesMask};
3668}
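// Worked example for the byte-mask computation above (illustration only): a
// masked load of <4 x i32> with mask <1,0,1,1> has 4-byte elements, so
// ElementMask is 0xF and the reverse walk over the mask builds
//
//   UsedBytesMask = 0xFF0F
//
// i.e. bytes 0-3, 8-11 and 12-15 of the 16-byte load are used, while bytes
// 4-7 (the masked-off second element) are not.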
3669
3670/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3671static std::optional<std::pair<SDValue, SDValue>>
3674 const EVT ResVT = LD->getValueType(0);
3675 const EVT MemVT = LD->getMemoryVT();
3676
3677 // If we're doing sign/zero extension as part of the load, avoid lowering to
3678 // a LoadV node. TODO: consider relaxing this restriction.
3679 if (ResVT != MemVT)
3680 return std::nullopt;
3681
3682 const auto NumEltsAndEltVT =
3683 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3684 if (!NumEltsAndEltVT)
3685 return std::nullopt;
3686 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3687
3688 Align Alignment = LD->getAlign();
3689 const auto &TD = DAG.getDataLayout();
3690 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3691 if (Alignment < PrefAlign) {
3692 // This load is not sufficiently aligned, so bail out and let this vector
3693 // load be scalarized. Note that we may still be able to emit smaller
3694 // vector loads. For example, if we are loading a <4 x float> with an
3695 // alignment of 8, this check will fail but the legalizer will try again
3696 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3697 return std::nullopt;
3698 }
3699
3700 // If we have a masked load, convert it to a normal load now
3701 std::optional<uint32_t> UsedBytesMask = std::nullopt;
3702 if (LD->getOpcode() == ISD::MLOAD)
3703 std::tie(LD, UsedBytesMask) =
3705
3706 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3707 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3708 // loaded type to i16 and propagate the "real" type as the memory type.
3709 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3710
3711 unsigned Opcode;
3712 switch (NumElts) {
3713 default:
3714 return std::nullopt;
3715 case 2:
3716 Opcode = NVPTXISD::LoadV2;
3717 break;
3718 case 4:
3719 Opcode = NVPTXISD::LoadV4;
3720 break;
3721 case 8:
3722 Opcode = NVPTXISD::LoadV8;
3723 break;
3724 }
3725 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3726 ListVTs.push_back(MVT::Other);
3727 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3728
3729 SDLoc DL(LD);
3730
3731 // Copy regular operands
3732 SmallVector<SDValue, 8> OtherOps(LD->ops());
3733
3734 OtherOps.push_back(
3735 DAG.getConstant(UsedBytesMask.value_or(UINT32_MAX), DL, MVT::i32));
3736
3737 // The select routine does not have access to the LoadSDNode instance, so
3738 // pass along the extension information
3739 OtherOps.push_back(
3740 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
3741
3742 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3743 LD->getMemOperand());
3744
3745 SmallVector<SDValue> ScalarRes;
3746 if (EltVT.isVector()) {
3748 assert(NumElts * EltVT.getVectorNumElements() ==
3749 ResVT.getVectorNumElements());
3750 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3751 // into individual elements.
3752 for (const unsigned I : llvm::seq(NumElts)) {
3753 SDValue SubVector = NewLD.getValue(I);
3754 DAG.ExtractVectorElements(SubVector, ScalarRes);
3755 }
3756 } else {
3757 for (const unsigned I : llvm::seq(NumElts)) {
3758 SDValue Res = NewLD.getValue(I);
3759 if (LoadEltVT != EltVT)
3760 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3761 ScalarRes.push_back(Res);
3762 }
3763 }
3764
3765 SDValue LoadChain = NewLD.getValue(NumElts);
3766
3767 const MVT BuildVecVT =
3768 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3769 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3770 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3771
3772 return {{LoadValue, LoadChain}};
3773}
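// Sketch of the overall transform above (illustration only): a sufficiently
// aligned
//
//   %v = load <4 x float>, ptr %p, align 16
//
// becomes a single NVPTXISD::LoadV4 producing four f32 values plus a chain,
// and the original vector is rebuilt from those scalars with a BUILD_VECTOR
// (bitcast back to the original type when subvector packing is involved).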
3774
3777 const NVPTXSubtarget &STI) {
3778 if (auto Res = replaceLoadVector(N, DAG, STI))
3779 Results.append({Res->first, Res->second});
3780}
3781
3783 const NVPTXSubtarget &STI) {
3784 if (auto Res = replaceLoadVector(N, DAG, STI))
3785 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3786 return SDValue();
3787}
3788
3789// v = ld i1* addr
3790// =>
3791// v1 = ld i8* addr (-> i16)
3792// v = trunc i16 to i1
3794 SDLoc dl(LD);
3795 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3796 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3797 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3798 LD->getBasePtr(), LD->getPointerInfo(),
3799 MVT::i8, LD->getAlign(),
3800 LD->getMemOperand()->getFlags());
3801 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3802 // The legalizer (the caller) is expecting two values from the legalized
3803 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3804 // in LegalizeDAG.cpp which also uses MergeValues.
3805 return DAG.getMergeValues({result, LD->getChain()}, dl);
3806}
3807
3808SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3809 LoadSDNode *LD = cast<LoadSDNode>(Op);
3810
3811 if (Op.getValueType() == MVT::i1)
3812 return lowerLOADi1(LD, DAG);
3813
3814 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3815 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3816 // we allow for more DAG combine opportunities.
3817 if (LD->getExtensionType() == ISD::EXTLOAD) {
3818 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3819 "Unexpected fpext-load");
3820 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3821 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3822 LD->getMemOperand());
3823 }
3824
3825 llvm_unreachable("Unexpected custom lowering for load");
3826}
3827
3828SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
3829  // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
3830  // handle masked loads of these types; we have to handle them here.
3831 // v2f32 also needs to be handled here if the subtarget has f32x2
3832 // instructions, making it legal.
3833 //
3834 // Note: misaligned masked loads should never reach this point
3835 // because the override of isLegalMaskedLoad in NVPTXTargetTransformInfo.cpp
3836  // will validate alignment. Therefore, we do not need to special-case them
3837  // here.
3838 EVT VT = Op.getValueType();
3839 if (NVPTX::isPackedVectorTy(VT)) {
3841 cast<MemSDNode>(Op.getNode()), DAG, STI);
3842 MemSDNode *LD = std::get<0>(Result);
3843 uint32_t UsedBytesMask = std::get<1>(Result);
3844
3845 SDLoc DL(LD);
3846
3847 // Copy regular operands
3848 SmallVector<SDValue, 8> OtherOps(LD->ops());
3849
3850 OtherOps.push_back(DAG.getConstant(UsedBytesMask, DL, MVT::i32));
3851
3852    // We currently do not lower extending loads, but we pass the extension
3853    // type anyway, as later handling expects it.
3854 OtherOps.push_back(
3855 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
3856 SDValue NewLD =
3857 DAG.getMemIntrinsicNode(NVPTXISD::MLoad, DL, LD->getVTList(), OtherOps,
3858 LD->getMemoryVT(), LD->getMemOperand());
3859 return NewLD;
3860 }
3861 return SDValue();
3862}
3863
3865 const NVPTXSubtarget &STI) {
3866 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3867 SDValue Val = N->getOperand(1);
3868 SDLoc DL(N);
3869 const EVT ValVT = Val.getValueType();
3870 const EVT MemVT = N->getMemoryVT();
3871
3872 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3873 // TODO: consider relaxing this restriction.
3874 if (ValVT != MemVT)
3875 return SDValue();
3876
3877 const auto NumEltsAndEltVT =
3878 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3879 if (!NumEltsAndEltVT)
3880 return SDValue();
3881 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3882
3883 const DataLayout &TD = DAG.getDataLayout();
3884
3885 Align Alignment = N->getAlign();
3886 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3887 if (Alignment < PrefAlign) {
3888 // This store is not sufficiently aligned, so bail out and let this vector
3889 // store be scalarized. Note that we may still be able to emit smaller
3890 // vector stores. For example, if we are storing a <4 x float> with an
3891 // alignment of 8, this check will fail but the legalizer will try again
3892 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3893 return SDValue();
3894 }
3895
3896 unsigned Opcode;
3897 switch (NumElts) {
3898 default:
3899 return SDValue();
3900 case 2:
3901 Opcode = NVPTXISD::StoreV2;
3902 break;
3903 case 4:
3904 Opcode = NVPTXISD::StoreV4;
3905 break;
3906 case 8:
3907 Opcode = NVPTXISD::StoreV8;
3908 break;
3909 }
3910
3912
3913 // First is the chain
3914 Ops.push_back(N->getOperand(0));
3915
3916 // Then the split values
3917 if (EltVT.isVector()) {
3919 assert(NumElts * EltVT.getVectorNumElements() ==
3920 ValVT.getVectorNumElements());
3921 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3922 // stored as b32s
3923 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3924 for (const unsigned I : llvm::seq(NumElts)) {
3925 SmallVector<SDValue, 4> SubVectorElts;
3926 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3927 NumEltsPerSubVector);
3928 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3929 }
3930 } else {
3931 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3932 for (const unsigned I : llvm::seq(NumElts)) {
3933 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3934 DAG.getIntPtrConstant(I, DL));
3935
3936 // Since StoreV2 is a target node, we cannot rely on DAG type
3937 // legalization. Therefore, we must ensure the type is legal. For i1 and
3938 // i8, we set the stored type to i16 and propagate the "real" type as the
3939 // memory type.
3940 if (EltVT.getSizeInBits() < 16)
3941 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3942 Ops.push_back(ExtVal);
3943 }
3944 }
3945
3946 // Then any remaining arguments
3947 Ops.append(N->op_begin() + 2, N->op_end());
3948
3949 SDValue NewSt =
3950 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3951 N->getMemoryVT(), N->getMemOperand());
3952
3954 return NewSt;
3955}
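// Sketch of the store counterpart (illustration only): a sufficiently aligned
//
//   store <8 x half> %v, ptr %p, align 16
//
// is emitted as an NVPTXISD::StoreV4 whose four value operands are the v2f16
// subvectors of %v, each stored as a 32-bit word.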
3956
3957SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3958 StoreSDNode *Store = cast<StoreSDNode>(Op);
3959 EVT VT = Store->getMemoryVT();
3960
3961 if (VT == MVT::i1)
3962 return LowerSTOREi1(Op, DAG);
3963
3964  // Lower stores of any other vector type, including v2f32, which we want to
3965  // break apart since it is not a widely-supported type.
3966 return lowerSTOREVector(Op, DAG, STI);
3967}
3968
3969// st i1 v, addr
3970// =>
3971// v1 = zxt v to i16
3972// st.u8 i16, addr
3973SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3974 SDNode *Node = Op.getNode();
3975 SDLoc dl(Node);
3976 StoreSDNode *ST = cast<StoreSDNode>(Node);
3977 SDValue Tmp1 = ST->getChain();
3978 SDValue Tmp2 = ST->getBasePtr();
3979 SDValue Tmp3 = ST->getValue();
3980 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3981 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3982 SDValue Result =
3983 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3984 ST->getAlign(), ST->getMemOperand()->getFlags());
3985 return Result;
3986}
3987
3988SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3989 SelectionDAG &DAG) const {
3990 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3991  // operand so that it can pass legalization.
3992
3993 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3994 "Custom lowering for 128-bit CopyToReg only");
3995
3996 SDNode *Node = Op.getNode();
3997 SDLoc DL(Node);
3998
3999 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
4000 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
4001 DAG.getIntPtrConstant(0, DL));
4002 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
4003 DAG.getIntPtrConstant(1, DL));
4004
4006 SmallVector<EVT, 3> ResultsType(Node->values());
4007
4008 NewOps[0] = Op->getOperand(0); // Chain
4009 NewOps[1] = Op->getOperand(1); // Dst Reg
4010 NewOps[2] = Lo; // Lower 64-bit
4011 NewOps[3] = Hi; // Higher 64-bit
4012 if (Op.getNumOperands() == 4)
4013 NewOps[4] = Op->getOperand(3); // Glue if exists
4014
4015 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
4016}
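// Illustration of the split above (a sketch, not an additional rule): the
// single i128 source is bitcast to v2i64 and the copy is rebuilt as
//
//   CopyToReg Chain, DstReg, Lo:i64, Hi:i64 [, Glue]
//
// so that every operand has a type the rest of legalization can handle.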
4017
4018unsigned NVPTXTargetLowering::getNumRegisters(
4019 LLVMContext &Context, EVT VT,
4020 std::optional<MVT> RegisterVT = std::nullopt) const {
4021 if (VT == MVT::i128 && RegisterVT == MVT::i128)
4022 return 1;
4023 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
4024}
4025
4026bool NVPTXTargetLowering::splitValueIntoRegisterParts(
4027 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4028 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4029 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
4030 Parts[0] = Val;
4031 return true;
4032 }
4033 return false;
4034}
4035
4036// This creates a target external symbol for a function parameter.
4037// The name of the symbol is composed from its index and the function name.
4038// A negative index corresponds to the special parameter (unsized array) used
4039// for passing variable arguments.
4040SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
4041 EVT T) const {
4042 StringRef SavedStr = nvTM->getStrPool().save(
4044 return DAG.getExternalSymbol(SavedStr.data(), T);
4045}
4046
4047SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
4048 EVT T) const {
4049 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
4050 return DAG.getExternalSymbol(SavedStr.data(), T);
4051}
4052
4053SDValue NVPTXTargetLowering::LowerFormalArguments(
4054    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4055 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4056 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4057 const DataLayout &DL = DAG.getDataLayout();
4058 LLVMContext &Ctx = *DAG.getContext();
4059 auto PtrVT = getPointerTy(DAG.getDataLayout());
4060
4061 const Function &F = DAG.getMachineFunction().getFunction();
4062
4063 SDValue Root = DAG.getRoot();
4064 SmallVector<SDValue, 16> OutChains;
4065
4066  // The number of IR arguments (F.args().size()) and Ins.size() need not
4067  // match. Ins.size() will be larger
4068  //   * if there is an aggregate argument with multiple fields (each field
4069  //     showing up separately in Ins)
4070  //   * if there is a vector argument with more elements than a typical
4071  //     vector length (generally more than 4), where each vector element is
4072  //     individually present in Ins.
4073  // So a different index should be used for indexing into Ins.
4074  // See the similar issue in LowerCall.
4075
4076 auto AllIns = ArrayRef(Ins);
4077 for (const auto &Arg : F.args()) {
4078 const auto ArgIns = AllIns.take_while(
4079 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
4080 AllIns = AllIns.drop_front(ArgIns.size());
4081
4082 Type *Ty = Arg.getType();
4083
4084 if (ArgIns.empty())
4085 report_fatal_error("Empty parameter types are not supported");
4086
4087 if (Arg.use_empty()) {
4088 // argument is dead
4089 for (const auto &In : ArgIns) {
4090 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
4091 InVals.push_back(DAG.getUNDEF(In.VT));
4092 }
4093 continue;
4094 }
4095
4096 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
4097
4098    // In the following cases, assign a node order of "ArgNo + 1" to newly
4099    // created nodes. The SDNodes for params have to appear in the same order
4100    // as their order of appearance in the original function. "ArgNo + 1"
4101    // holds that order.
4102 if (Arg.hasByValAttr()) {
4103 // Param has ByVal attribute
4104 // Return MoveParam(param symbol).
4105      // Ideally, the param symbol could be returned directly,
4106      // but when the SDNode builder decides to use it in a CopyToReg(),
4107      // the machine instruction fails because a TargetExternalSymbol
4108      // (not lowered) is target dependent, and CopyToReg assumes
4109      // the source is lowered.
4110 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
4111 const auto &ByvalIn = ArgIns[0];
4112 assert(getValueType(DL, Ty) == ByvalIn.VT &&
4113 "Ins type did not match function type");
4114 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
4115
4116 SDValue P;
4117 if (isKernelFunction(F)) {
4118 P = ArgSymbol;
4119 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4120 } else {
4121 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
4122 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4123 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
4125 }
4126 InVals.push_back(P);
4127 } else {
4130 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
4131 assert(VTs.size() == ArgIns.size() && "Size mismatch");
4132 assert(VTs.size() == Offsets.size() && "Size mismatch");
4133
4134 const Align ArgAlign = getFunctionArgumentAlignment(
4135 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
4136
4137 unsigned I = 0;
4138 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
4139 for (const unsigned NumElts : VI) {
4140 // i1 is loaded/stored as i8
4141 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
4142 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
4143
4144 SDValue VecAddr = DAG.getObjectPtrOffset(
4145 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
4146
4147 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
4148 SDValue P =
4149 DAG.getLoad(VecVT, dl, Root, VecAddr,
4153 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4154 for (const unsigned J : llvm::seq(NumElts)) {
4155 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
4156
4157 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
4158 DAG, dl);
4159 InVals.push_back(Elt);
4160 }
4161 I += NumElts;
4162 }
4163 }
4164 }
4165
4166 if (!OutChains.empty())
4167 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
4168
4169 return Chain;
4170}
4171
4172SDValue
4173NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
4174                                 bool isVarArg,
4175                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
4176                                 const SmallVectorImpl<SDValue> &OutVals,
4177 const SDLoc &dl, SelectionDAG &DAG) const {
4178 const Function &F = DAG.getMachineFunction().getFunction();
4179 Type *RetTy = F.getReturnType();
4180
4181 if (RetTy->isVoidTy()) {
4182 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
4183 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4184 }
4185
4186 const DataLayout &DL = DAG.getDataLayout();
4187 LLVMContext &Ctx = *DAG.getContext();
4188
4189 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
4190 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
4191
4192 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
4193 // 32-bits are sign extended or zero extended, depending on whether
4194 // they are signed or unsigned types.
4195 const bool ExtendIntegerRetVal =
4196 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
4197
4200 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
4201 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
4202
4203 const auto GetRetVal = [&](unsigned I) -> SDValue {
4204 SDValue RetVal = OutVals[I];
4206 RetVal.getValueType() &&
4207 "OutVal type should always be legal");
4208
4209 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
4210 const EVT StoreVT =
4211 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
4212 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
4213 };
4214
4215 unsigned I = 0;
4216 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
4217 for (const unsigned NumElts : VI) {
4218 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
4219 ? MaybeAlign(std::nullopt)
4220 : commonAlignment(RetAlign, Offsets[I]);
4221
4223 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
4224
4225 SDValue Ptr =
4226 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
4227
4228 Chain = DAG.getStore(Chain, dl, Val, Ptr,
4230
4231 I += NumElts;
4232 }
4233
4234 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4235}
4236
4237void NVPTXTargetLowering::LowerAsmOperandForConstraint(
4238    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
4239 SelectionDAG &DAG) const {
4240 if (Constraint.size() > 1)
4241 return;
4243}
4244
4245// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4246// TgtMemIntrinsic because we need the information that is only available in
4247// the "Value" type of the destination pointer; in particular, the address
4248// space information.
4250bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4251 const CallBase &I,
4252 MachineFunction &MF,
4253 unsigned Intrinsic) const {
4254 switch (Intrinsic) {
4255 default:
4256 return false;
4257 case Intrinsic::nvvm_match_all_sync_i32p:
4258 case Intrinsic::nvvm_match_all_sync_i64p:
4259 Info.opc = ISD::INTRINSIC_W_CHAIN;
4260 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4261 // in order to model data exchange with other threads, but perform no real
4262 // memory accesses.
4263 Info.memVT = MVT::i1;
4264
4265 // Our result depends on both our and other thread's arguments.
4267 return true;
4268 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4269 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4270 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4271 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4272 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4273 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4274 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4275 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4276 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4277 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4278 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4279 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4280 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4281 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4282 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4283 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4284 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4285 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4286 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4287 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4288 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4289 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4290 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4291 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4292 Info.opc = ISD::INTRINSIC_W_CHAIN;
4293 Info.memVT = MVT::v8f16;
4294 Info.ptrVal = I.getArgOperand(0);
4295 Info.offset = 0;
4296 Info.flags = MachineMemOperand::MOLoad;
4297 Info.align = Align(16);
4298 return true;
4299 }
4300 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4301 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4302 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4303 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4304 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4305 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4306 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4307 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4308 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4309 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4310 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4311 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4312 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4313 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4314 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4315 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4316 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4317 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4318 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4319 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4320 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4321 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4322 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4323 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4324 Info.opc = ISD::INTRINSIC_W_CHAIN;
4325 Info.memVT = MVT::v2i32;
4326 Info.ptrVal = I.getArgOperand(0);
4327 Info.offset = 0;
4328 Info.flags = MachineMemOperand::MOLoad;
4329 Info.align = Align(8);
4330 return true;
4331 }
4332
4333 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4334 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4335 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4336 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4337 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4338 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4339 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4340 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4341 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4342 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4343 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4344 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4345 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4346 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4347 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4348 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4349
4350 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4351 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4352 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4353 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4354 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4355 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4356 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4357 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4358 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4359 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4360 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4361 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4362 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4363 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4364 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4365 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4366 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4367 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
4368 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
4369 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
4370 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
4371 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
4372 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
4373 Info.opc = ISD::INTRINSIC_W_CHAIN;
4374 Info.memVT = MVT::v4i32;
4375 Info.ptrVal = I.getArgOperand(0);
4376 Info.offset = 0;
4377 Info.flags = MachineMemOperand::MOLoad;
4378 Info.align = Align(16);
4379 return true;
4380 }
4381
4382 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4383 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4384 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4385 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4386 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4387 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4388 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4389 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4390
4391 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4392 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4393 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4394 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4395 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4396 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4397 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4398 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4399 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4400 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4401 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4402 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4403 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4404 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4405 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4406 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4407 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4408 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4409 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4410 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4411 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4412 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4413 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4414 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4415 Info.opc = ISD::INTRINSIC_W_CHAIN;
4416 Info.memVT = MVT::i32;
4417 Info.ptrVal = I.getArgOperand(0);
4418 Info.offset = 0;
4419 Info.flags = MachineMemOperand::MOLoad;
4420 Info.align = Align(4);
4421 return true;
4422 }
4423
4424 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4425 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4426 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4427 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4428 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4429 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4430 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4431 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4432 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4433 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4434 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4435 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4436 Info.opc = ISD::INTRINSIC_W_CHAIN;
4437 Info.memVT = MVT::v4f16;
4438 Info.ptrVal = I.getArgOperand(0);
4439 Info.offset = 0;
4440 Info.flags = MachineMemOperand::MOLoad;
4441 Info.align = Align(16);
4442 return true;
4443 }
4444
4445 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4446 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4447 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4448 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4449 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4450 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4451 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4452 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4453 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4454 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4455 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4456 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4457 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4458 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4459 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4460 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4461 Info.opc = ISD::INTRINSIC_W_CHAIN;
4462 Info.memVT = MVT::v8f32;
4463 Info.ptrVal = I.getArgOperand(0);
4464 Info.offset = 0;
4465 Info.flags = MachineMemOperand::MOLoad;
4466 Info.align = Align(16);
4467 return true;
4468 }
4469
4470 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4471 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4472 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4473 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4474
4475 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4476 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4477 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4478 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4479
4480 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4481 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4482 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4483 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4484 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4485 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4486 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4487 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4488 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4489 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4490 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4491 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4492 Info.opc = ISD::INTRINSIC_W_CHAIN;
4493 Info.memVT = MVT::v8i32;
4494 Info.ptrVal = I.getArgOperand(0);
4495 Info.offset = 0;
4496 Info.flags = MachineMemOperand::MOLoad;
4497 Info.align = Align(16);
4498 return true;
4499 }
4500
4501 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4502 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4503 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4504 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4505 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4506 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4507 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4508 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4509 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4510 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4511 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4512 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4513 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4514 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4515 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4516 Info.opc = ISD::INTRINSIC_W_CHAIN;
4517 Info.memVT = MVT::v2i32;
4518 Info.ptrVal = I.getArgOperand(0);
4519 Info.offset = 0;
4520 Info.flags = MachineMemOperand::MOLoad;
4521 Info.align = Align(8);
4522 return true;
4523 }
4524
4525 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4526 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4527 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4528 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4529
4530 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4531 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4532 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4533 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4534 Info.opc = ISD::INTRINSIC_W_CHAIN;
4535 Info.memVT = MVT::f64;
4536 Info.ptrVal = I.getArgOperand(0);
4537 Info.offset = 0;
4538 Info.flags = MachineMemOperand::MOLoad;
4539 Info.align = Align(8);
4540 return true;
4541 }
4542
4543 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4544 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4545 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4546 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4547 Info.opc = ISD::INTRINSIC_W_CHAIN;
4548 Info.memVT = MVT::v2f64;
4549 Info.ptrVal = I.getArgOperand(0);
4550 Info.offset = 0;
4551 Info.flags = MachineMemOperand::MOLoad;
4552 Info.align = Align(16);
4553 return true;
4554 }
4555
4556 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4557 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4558 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4559 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4560 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4561 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4562 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4563 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4564 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4565 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4566 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4567 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4568 Info.opc = ISD::INTRINSIC_VOID;
4569 Info.memVT = MVT::v4f16;
4570 Info.ptrVal = I.getArgOperand(0);
4571 Info.offset = 0;
4572 Info.flags = MachineMemOperand::MOStore;
4573 Info.align = Align(16);
4574 return true;
4575 }
4576
4577 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4578 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4579 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4580 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4581 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4582 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4583 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4584 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4585 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4586 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4587 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4588 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4589 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4590 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4591 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4592 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4593 Info.opc = ISD::INTRINSIC_VOID;
4594 Info.memVT = MVT::v8f32;
4595 Info.ptrVal = I.getArgOperand(0);
4596 Info.offset = 0;
4597 Info.flags = MachineMemOperand::MOStore;
4598 Info.align = Align(16);
4599 return true;
4600 }
4601
4602 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4603 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4604 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4605 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4606 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4607 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4608 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4609 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4610 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4611 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4612 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4613 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4614 Info.opc = ISD::INTRINSIC_VOID;
4615 Info.memVT = MVT::v8i32;
4616 Info.ptrVal = I.getArgOperand(0);
4617 Info.offset = 0;
4618 Info.flags = MachineMemOperand::MOStore;
4619 Info.align = Align(16);
4620 return true;
4621 }
4622
4623 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4624 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4625 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4626 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4627 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4628 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4629 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4630 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4631 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4632 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4633 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4634 Info.opc = ISD::INTRINSIC_VOID;
4635 Info.memVT = MVT::v2i32;
4636 Info.ptrVal = I.getArgOperand(0);
4637 Info.offset = 0;
4638 Info.flags = MachineMemOperand::MOStore;
4639 Info.align = Align(8);
4640 return true;
4641 }
4642
4643 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4644 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4645 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4646 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4647 Info.opc = ISD::INTRINSIC_VOID;
4648 Info.memVT = MVT::v2f64;
4649 Info.ptrVal = I.getArgOperand(0);
4650 Info.offset = 0;
4651 Info.flags = MachineMemOperand::MOStore;
4652 Info.align = Align(16);
4653 return true;
4654 }
4655
4656 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4657 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4658 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4659 Info.opc = ISD::INTRINSIC_VOID;
4660 Info.memVT = MVT::i32;
4661 Info.ptrVal = I.getArgOperand(0);
4662 Info.offset = 0;
4663 Info.flags = MachineMemOperand::MOStore;
4664 Info.align = Align(4);
4665 return true;
4666 }
4667
4668 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4669 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4670 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4671 Info.opc = ISD::INTRINSIC_VOID;
4672 Info.memVT = MVT::v4i32;
4673 Info.ptrVal = I.getArgOperand(0);
4674 Info.offset = 0;
4675 Info.flags = MachineMemOperand::MOStore;
4676 Info.align = Align(16);
4677 return true;
4678 }
4679
4680 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4681 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4682 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4683 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4684 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4685 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4686 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4687 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4688 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4689 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4690 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4691 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4692 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4693 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4694 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4695 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4696 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4697 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4698 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4699 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4700 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4701 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4702 auto &DL = I.getDataLayout();
4703 Info.opc = ISD::INTRINSIC_W_CHAIN;
4704 Info.memVT = getValueType(DL, I.getType());
4705 Info.ptrVal = I.getArgOperand(0);
4706 Info.offset = 0;
4708 Info.align.reset();
4709 return true;
4710 }
4711
4712 case Intrinsic::nvvm_prefetch_tensormap: {
4713 auto &DL = I.getDataLayout();
4714 Info.opc = ISD::INTRINSIC_VOID;
4715 Info.memVT = getPointerTy(DL);
4716 Info.ptrVal = I.getArgOperand(0);
4717 Info.offset = 0;
4718 Info.flags =
4720 Info.align.reset();
4721 return true;
4722 }
4723
4724 case Intrinsic::nvvm_tensormap_replace_global_address:
4725 case Intrinsic::nvvm_tensormap_replace_global_stride: {
4726 Info.opc = ISD::INTRINSIC_VOID;
4727 Info.memVT = MVT::i64;
4728 Info.ptrVal = I.getArgOperand(0);
4729 Info.offset = 0;
4730 Info.flags = MachineMemOperand::MOStore;
4731 Info.align.reset();
4732 return true;
4733 }
4734
4735 case Intrinsic::nvvm_tensormap_replace_rank:
4736 case Intrinsic::nvvm_tensormap_replace_box_dim:
4737 case Intrinsic::nvvm_tensormap_replace_global_dim:
4738 case Intrinsic::nvvm_tensormap_replace_element_stride:
4739 case Intrinsic::nvvm_tensormap_replace_elemtype:
4740 case Intrinsic::nvvm_tensormap_replace_interleave_layout:
4741 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
4742 case Intrinsic::nvvm_tensormap_replace_swizzle_atomicity:
4743 case Intrinsic::nvvm_tensormap_replace_fill_mode: {
4744 Info.opc = ISD::INTRINSIC_VOID;
4745 Info.memVT = MVT::i32;
4746 Info.ptrVal = I.getArgOperand(0);
4747 Info.offset = 0;
4748 Info.flags = MachineMemOperand::MOStore;
4749 Info.align.reset();
4750 return true;
4751 }
4752
4753 case Intrinsic::nvvm_ldu_global_i:
4754 case Intrinsic::nvvm_ldu_global_f:
4755 case Intrinsic::nvvm_ldu_global_p: {
4756 Info.opc = ISD::INTRINSIC_W_CHAIN;
4757 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4758 Info.ptrVal = I.getArgOperand(0);
4759 Info.offset = 0;
4760 Info.flags = MachineMemOperand::MOLoad;
4761 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4762
4763 return true;
4764 }
4765 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4766 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4767 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4768 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4769 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4770 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4771 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4772 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4773 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4774 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4775 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4776 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4777 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4778 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4779 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4780 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4781 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4782 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4783 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4784 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4785 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4786 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4787 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4788 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4789 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4790 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4791 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4792 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4793 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4794 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4795 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4796 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4797 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4798 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4799 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4800 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4801 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4802 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4803 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4804 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4805 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4806 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4807 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4808 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4809 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4810 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4811 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4812 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4813 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4814 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4815 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4816 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4817 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4818 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4819 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4820 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4821 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4822 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4823 Info.opc = ISD::INTRINSIC_W_CHAIN;
4824 Info.memVT = MVT::v4f32;
4825 Info.ptrVal = nullptr;
4826 Info.offset = 0;
4827 Info.flags = MachineMemOperand::MOLoad;
4828 Info.align = Align(16);
4829 return true;
4830
4831 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4832 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4833 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4834 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4835 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4836 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4837 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4838 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4839 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4840 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4841 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4842 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4843 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4844 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4845 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4846 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4847 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4848 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4849 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4850 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4851 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4852 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4853 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4854 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4855 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4856 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4857 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4858 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4859 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4860 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4861 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4862 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4863 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4864 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4865 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4866 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4867 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4868 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4869 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4870 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4871 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4872 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4873 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4874 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4875 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4876 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4877 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4878 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4879 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4880 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4881 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4882 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4883 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4884 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4885 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4886 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4887 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4888 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4889 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4890 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4891 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4892 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4893 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4894 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4895 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4896 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4897 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4898 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4899 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4900 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4901 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4902 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4903 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4904 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4905 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4906 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4907 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4908 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4909 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4910 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4911 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4912 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4913 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4914 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4915 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4916 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4917 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4918 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4919 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4920 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4921 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4922 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4923 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4924 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4925 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4926 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4927 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4928 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4929 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4930 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4931 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4932 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4933 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4934 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4935 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4936 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4937 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4938 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4939 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4940 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4941 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4942 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4943 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4944 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4945 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4946 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4947 Info.opc = ISD::INTRINSIC_W_CHAIN;
4948 Info.memVT = MVT::v4i32;
4949 Info.ptrVal = nullptr;
4950 Info.offset = 0;
4951 Info.flags = MachineMemOperand::MOLoad;
4952 Info.align = Align(16);
4953 return true;
4954
4955 case Intrinsic::nvvm_suld_1d_i8_clamp:
4956 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4957 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4958 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4959 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4960 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4961 case Intrinsic::nvvm_suld_2d_i8_clamp:
4962 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4963 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4964 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4965 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4966 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4967 case Intrinsic::nvvm_suld_3d_i8_clamp:
4968 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4969 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4970 case Intrinsic::nvvm_suld_1d_i8_trap:
4971 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4972 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4973 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4974 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4975 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4976 case Intrinsic::nvvm_suld_2d_i8_trap:
4977 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4978 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4979 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4980 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4981 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4982 case Intrinsic::nvvm_suld_3d_i8_trap:
4983 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4984 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4985 case Intrinsic::nvvm_suld_1d_i8_zero:
4986 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4987 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4988 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4989 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4990 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4991 case Intrinsic::nvvm_suld_2d_i8_zero:
4992 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4993 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4994 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4995 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4996 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4997 case Intrinsic::nvvm_suld_3d_i8_zero:
4998 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4999 case Intrinsic::nvvm_suld_3d_v4i8_zero:
5000 Info.opc = ISD::INTRINSIC_W_CHAIN;
5001 Info.memVT = MVT::i8;
5002 Info.ptrVal = nullptr;
5003 Info.offset = 0;
5004 Info.flags = MachineMemOperand::MOLoad;
5005 Info.align = Align(16);
5006 return true;
5007
5008 case Intrinsic::nvvm_suld_1d_i16_clamp:
5009 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
5010 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
5011 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
5012 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
5013 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
5014 case Intrinsic::nvvm_suld_2d_i16_clamp:
5015 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
5016 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
5017 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
5018 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
5019 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
5020 case Intrinsic::nvvm_suld_3d_i16_clamp:
5021 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
5022 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
5023 case Intrinsic::nvvm_suld_1d_i16_trap:
5024 case Intrinsic::nvvm_suld_1d_v2i16_trap:
5025 case Intrinsic::nvvm_suld_1d_v4i16_trap:
5026 case Intrinsic::nvvm_suld_1d_array_i16_trap:
5027 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
5028 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
5029 case Intrinsic::nvvm_suld_2d_i16_trap:
5030 case Intrinsic::nvvm_suld_2d_v2i16_trap:
5031 case Intrinsic::nvvm_suld_2d_v4i16_trap:
5032 case Intrinsic::nvvm_suld_2d_array_i16_trap:
5033 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
5034 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
5035 case Intrinsic::nvvm_suld_3d_i16_trap:
5036 case Intrinsic::nvvm_suld_3d_v2i16_trap:
5037 case Intrinsic::nvvm_suld_3d_v4i16_trap:
5038 case Intrinsic::nvvm_suld_1d_i16_zero:
5039 case Intrinsic::nvvm_suld_1d_v2i16_zero:
5040 case Intrinsic::nvvm_suld_1d_v4i16_zero:
5041 case Intrinsic::nvvm_suld_1d_array_i16_zero:
5042 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
5043 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
5044 case Intrinsic::nvvm_suld_2d_i16_zero:
5045 case Intrinsic::nvvm_suld_2d_v2i16_zero:
5046 case Intrinsic::nvvm_suld_2d_v4i16_zero:
5047 case Intrinsic::nvvm_suld_2d_array_i16_zero:
5048 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
5049 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
5050 case Intrinsic::nvvm_suld_3d_i16_zero:
5051 case Intrinsic::nvvm_suld_3d_v2i16_zero:
5052 case Intrinsic::nvvm_suld_3d_v4i16_zero:
5053 Info.opc = ISD::INTRINSIC_W_CHAIN;
5054 Info.memVT = MVT::i16;
5055 Info.ptrVal = nullptr;
5056 Info.offset = 0;
5057 Info.flags = MachineMemOperand::MOLoad;
5058 Info.align = Align(16);
5059 return true;
5060
5061 case Intrinsic::nvvm_suld_1d_i32_clamp:
5062 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
5063 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
5064 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5065 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5066 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5067 case Intrinsic::nvvm_suld_2d_i32_clamp:
5068 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5069 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5070 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5071 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5072 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5073 case Intrinsic::nvvm_suld_3d_i32_clamp:
5074 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5075 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5076 case Intrinsic::nvvm_suld_1d_i32_trap:
5077 case Intrinsic::nvvm_suld_1d_v2i32_trap:
5078 case Intrinsic::nvvm_suld_1d_v4i32_trap:
5079 case Intrinsic::nvvm_suld_1d_array_i32_trap:
5080 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5081 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5082 case Intrinsic::nvvm_suld_2d_i32_trap:
5083 case Intrinsic::nvvm_suld_2d_v2i32_trap:
5084 case Intrinsic::nvvm_suld_2d_v4i32_trap:
5085 case Intrinsic::nvvm_suld_2d_array_i32_trap:
5086 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5087 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5088 case Intrinsic::nvvm_suld_3d_i32_trap:
5089 case Intrinsic::nvvm_suld_3d_v2i32_trap:
5090 case Intrinsic::nvvm_suld_3d_v4i32_trap:
5091 case Intrinsic::nvvm_suld_1d_i32_zero:
5092 case Intrinsic::nvvm_suld_1d_v2i32_zero:
5093 case Intrinsic::nvvm_suld_1d_v4i32_zero:
5094 case Intrinsic::nvvm_suld_1d_array_i32_zero:
5095 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5096 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5097 case Intrinsic::nvvm_suld_2d_i32_zero:
5098 case Intrinsic::nvvm_suld_2d_v2i32_zero:
5099 case Intrinsic::nvvm_suld_2d_v4i32_zero:
5100 case Intrinsic::nvvm_suld_2d_array_i32_zero:
5101 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5102 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5103 case Intrinsic::nvvm_suld_3d_i32_zero:
5104 case Intrinsic::nvvm_suld_3d_v2i32_zero:
5105 case Intrinsic::nvvm_suld_3d_v4i32_zero:
5106 Info.opc = ISD::INTRINSIC_W_CHAIN;
5107 Info.memVT = MVT::i32;
5108 Info.ptrVal = nullptr;
5109 Info.offset = 0;
5110 Info.flags = MachineMemOperand::MOLoad;
5111 Info.align = Align(16);
5112 return true;
5113
5114 case Intrinsic::nvvm_suld_1d_i64_clamp:
5115 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5116 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5117 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5118 case Intrinsic::nvvm_suld_2d_i64_clamp:
5119 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5120 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5121 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5122 case Intrinsic::nvvm_suld_3d_i64_clamp:
5123 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5124 case Intrinsic::nvvm_suld_1d_i64_trap:
5125 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5126 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5127 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5128 case Intrinsic::nvvm_suld_2d_i64_trap:
5129 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5130 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5131 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5132 case Intrinsic::nvvm_suld_3d_i64_trap:
5133 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5134 case Intrinsic::nvvm_suld_1d_i64_zero:
5135 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5136 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5137 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5138 case Intrinsic::nvvm_suld_2d_i64_zero:
5139 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5140 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5141 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5142 case Intrinsic::nvvm_suld_3d_i64_zero:
5143 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5144 Info.opc = ISD::INTRINSIC_W_CHAIN;
5145 Info.memVT = MVT::i64;
5146 Info.ptrVal = nullptr;
5147 Info.offset = 0;
5148 Info.flags = MachineMemOperand::MOLoad;
5149 Info.align = Align(16);
5150 return true;
5151
5152 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
5153 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
5154 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
5155 Info.opc = ISD::INTRINSIC_W_CHAIN;
5156 Info.memVT = MVT::v1i32;
5157 Info.ptrVal = I.getArgOperand(0);
5158 Info.offset = 0;
5159 Info.flags = MachineMemOperand::MOLoad;
5160 Info.align.reset();
5161 return true;
5162 }
5163
5164 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
5165 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
5166 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
5167 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
5168 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
5169 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {
5170 Info.opc = ISD::INTRINSIC_W_CHAIN;
5171 Info.memVT = MVT::v2i32;
5172 Info.ptrVal = I.getArgOperand(0);
5173 Info.offset = 0;
5174 Info.flags = MachineMemOperand::MOLoad;
5175 Info.align.reset();
5176 return true;
5177 }
5178
5179 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
5180 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32: {
5181 Info.opc = ISD::INTRINSIC_W_CHAIN;
5182 Info.memVT = MVT::v2f32;
5183 Info.ptrVal = I.getArgOperand(0);
5184 Info.offset = 0;
5185 Info.flags = MachineMemOperand::MOLoad;
5186 Info.align.reset();
5187 return true;
5188 }
5189
5190 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
5191 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
5192 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
5193 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
5194 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
5195 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
5196 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32: {
5197 Info.opc = ISD::INTRINSIC_W_CHAIN;
5198 Info.memVT = MVT::v4i32;
5199 Info.ptrVal = I.getArgOperand(0);
5200 Info.offset = 0;
5201 Info.flags = MachineMemOperand::MOLoad;
5202 Info.align.reset();
5203 return true;
5204 }
5205
5206 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
5207 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32: {
5208 Info.opc = ISD::INTRINSIC_W_CHAIN;
5209 Info.memVT = MVT::v4f32;
5210 Info.ptrVal = I.getArgOperand(0);
5211 Info.offset = 0;
5212 Info.flags = MachineMemOperand::MOLoad;
5213 Info.align.reset();
5214 return true;
5215 }
5216
5217 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
5218 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
5219 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
5220 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
5221 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
5222 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
5223 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32: {
5224 Info.opc = ISD::INTRINSIC_W_CHAIN;
5225 Info.memVT = MVT::v8i32;
5226 Info.ptrVal = I.getArgOperand(0);
5227 Info.offset = 0;
5228 Info.flags = MachineMemOperand::MOLoad;
5229 Info.align.reset();
5230 return true;
5231 }
5232
5233 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
5234 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32: {
5235 Info.opc = ISD::INTRINSIC_W_CHAIN;
5236 Info.memVT = MVT::v8f32;
5237 Info.ptrVal = I.getArgOperand(0);
5238 Info.offset = 0;
5239 Info.flags = MachineMemOperand::MOLoad;
5240 Info.align.reset();
5241 return true;
5242 }
5243
5244 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
5245 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
5246 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
5247 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
5248 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
5249 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
5250 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32: {
5251 Info.opc = ISD::INTRINSIC_W_CHAIN;
5252 Info.memVT = MVT::v16i32;
5253 Info.ptrVal = I.getArgOperand(0);
5254 Info.offset = 0;
5255 Info.flags = MachineMemOperand::MOLoad;
5256 Info.align.reset();
5257 return true;
5258 }
5259
5260 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
5261 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32: {
5262 Info.opc = ISD::INTRINSIC_W_CHAIN;
5263 Info.memVT = MVT::v16f32;
5264 Info.ptrVal = I.getArgOperand(0);
5265 Info.offset = 0;
5266 Info.flags = MachineMemOperand::MOLoad;
5267 Info.align.reset();
5268 return true;
5269 }
5270
5271 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
5272 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
5273 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
5274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
5275 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
5276 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
5277 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32: {
5278 Info.opc = ISD::INTRINSIC_W_CHAIN;
5279 Info.memVT = MVT::v32i32;
5280 Info.ptrVal = I.getArgOperand(0);
5281 Info.offset = 0;
5282 Info.flags = MachineMemOperand::MOLoad;
5283 Info.align.reset();
5284 return true;
5285 }
5286
5287 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
5288 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32: {
5289 Info.opc = ISD::INTRINSIC_W_CHAIN;
5290 Info.memVT = MVT::v32f32;
5291 Info.ptrVal = I.getArgOperand(0);
5292 Info.offset = 0;
5293 Info.flags = MachineMemOperand::MOLoad;
5294 Info.align.reset();
5295 return true;
5296 }
5297
5298 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
5299 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
5300 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
5301 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
5302 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
5303 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
5304 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32: {
5305 Info.opc = ISD::INTRINSIC_W_CHAIN;
5306 Info.memVT = MVT::v64i32;
5307 Info.ptrVal = I.getArgOperand(0);
5308 Info.offset = 0;
5309 Info.flags = MachineMemOperand::MOLoad;
5310 Info.align.reset();
5311 return true;
5312 }
5313
5314 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
5315 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32: {
5316 Info.opc = ISD::INTRINSIC_W_CHAIN;
5317 Info.memVT = MVT::v64f32;
5318 Info.ptrVal = I.getArgOperand(0);
5319 Info.offset = 0;
5320 Info.flags = MachineMemOperand::MOLoad;
5321 Info.align.reset();
5322 return true;
5323 }
5324
5325 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
5326 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
5327 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
5328 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
5329 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
5330 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
5331 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32: {
5332 Info.opc = ISD::INTRINSIC_W_CHAIN;
5333 Info.memVT = MVT::v128i32;
5334 Info.ptrVal = I.getArgOperand(0);
5335 Info.offset = 0;
5336 Info.flags = MachineMemOperand::MOLoad;
5337 Info.align.reset();
5338 return true;
5339 }
5340
5341 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
5342 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {
5343 Info.opc = ISD::INTRINSIC_W_CHAIN;
5344 Info.memVT = MVT::v128f32;
5345 Info.ptrVal = I.getArgOperand(0);
5346 Info.offset = 0;
5347 Info.flags = MachineMemOperand::MOLoad;
5348 Info.align.reset();
5349 return true;
5350 }
5351
5352 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
5353 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
5354 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
5355 Info.opc = ISD::INTRINSIC_VOID;
5356 Info.memVT = MVT::i32;
5357 Info.ptrVal = I.getArgOperand(0);
5358 Info.offset = 0;
5359 Info.flags = MachineMemOperand::MOStore;
5360 Info.align.reset();
5361 return true;
5362 }
5363
5364 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
5365 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
5366 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
5367 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
5368 Info.opc = ISD::INTRINSIC_VOID;
5369 Info.memVT = MVT::v2i32;
5370 Info.ptrVal = I.getArgOperand(0);
5371 Info.offset = 0;
5372 Info.flags = MachineMemOperand::MOStore;
5373 Info.align.reset();
5374 return true;
5375 }
5376
5377 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
5378 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
5379 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
5380 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
5381 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
5382 Info.opc = ISD::INTRINSIC_VOID;
5383 Info.memVT = MVT::v4i32;
5384 Info.ptrVal = I.getArgOperand(0);
5385 Info.offset = 0;
5386 Info.flags = MachineMemOperand::MOStore;
5387 Info.align.reset();
5388 return true;
5389 }
5390
5391 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
5392 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
5393 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
5394 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
5395 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
5396 Info.opc = ISD::INTRINSIC_VOID;
5397 Info.memVT = MVT::v8i32;
5398 Info.ptrVal = I.getArgOperand(0);
5399 Info.offset = 0;
5400 Info.flags = MachineMemOperand::MOStore;
5401 Info.align.reset();
5402 return true;
5403 }
5404
5405 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
5406 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
5407 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
5408 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
5409 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
5410 Info.opc = ISD::INTRINSIC_VOID;
5411 Info.memVT = MVT::v16i32;
5412 Info.ptrVal = I.getArgOperand(0);
5413 Info.offset = 0;
5414 Info.flags = MachineMemOperand::MOStore;
5415 Info.align.reset();
5416 return true;
5417 }
5418
5419 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
5420 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
5421 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
5422 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
5423 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
5424 Info.opc = ISD::INTRINSIC_VOID;
5425 Info.memVT = MVT::v32i32;
5426 Info.ptrVal = I.getArgOperand(0);
5427 Info.offset = 0;
5428 Info.flags = MachineMemOperand::MOStore;
5429 Info.align.reset();
5430 return true;
5431 }
5432
5433 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
5434 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
5435 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
5436 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
5437 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
5438 Info.opc = ISD::INTRINSIC_VOID;
5439 Info.memVT = MVT::v64i32;
5440 Info.ptrVal = I.getArgOperand(0);
5441 Info.offset = 0;
5442 Info.flags = MachineMemOperand::MOStore;
5443 Info.align.reset();
5444 return true;
5445 }
5446
5447 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
5448 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
5449 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
5450 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
5451 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
5452 Info.opc = ISD::INTRINSIC_VOID;
5453 Info.memVT = MVT::v128i32;
5454 Info.ptrVal = I.getArgOperand(0);
5455 Info.offset = 0;
5456 Info.flags = MachineMemOperand::MOStore;
5457 Info.align.reset();
5458 return true;
5459 }
5460 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
5461 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
5462 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
5463 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
5464 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
5465 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
5466 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
5467 case Intrinsic::
5468 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
5469 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
5470 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
5471 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
5472 case Intrinsic::
5473 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
5474 // We are reading and writing back to TMem
5475 Info.opc = ISD::INTRINSIC_VOID;
5476 Info.memVT = MVT::v4i32;
5477 Info.ptrVal = I.getArgOperand(0);
5478 Info.offset = 0;
5479 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5480 Info.align = Align(16);
5481 return true;
5482 }
5483
5484 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
5485 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
5486 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
5487 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
5488 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
5489 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
5490 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
5491 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
5492 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
5493 case Intrinsic::
5494 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
5495 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
5496 case Intrinsic::
5497 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
5498 // We are reading and writing back to TMem
5499 Info.opc = ISD::INTRINSIC_VOID;
5500 Info.memVT = MVT::v8i32;
5501 Info.ptrVal = I.getArgOperand(0);
5502 Info.offset = 0;
5503 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5504 Info.align = Align(16);
5505 return true;
5506 }
5507 }
5508 return false;
5509}
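// Illustrative note (a sketch of how the cases above are consumed, assuming
// the usual SelectionDAG flow): for Intrinsic::nvvm_ldu_global_i the entry
// records an MOLoad of the loaded type at the pointer operand, with the
// alignment taken from the constant second argument, and the DAG builder uses
// that IntrinsicInfo to attach a matching MachineMemOperand to the node.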
5510
5511/// getFunctionParamOptimizedAlign - since function arguments are passed via
5512/// .param space, we may want to increase their alignment in a way that
5513/// ensures that we can effectively vectorize their loads & stores. We can
5514/// increase alignment only if the function has internal or private linkage,
5515/// as for other linkage types callers may already rely on the default
5516/// alignment. To allow using 128-bit vectorized loads/stores, this function
5517/// ensures that alignment is 16 or greater.
5518Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5519 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5520 // Capping the alignment to 128 bytes as that is the maximum alignment
5521 // supported by PTX.
5522 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5523
5524 // If a function has linkage different from internal or private, we
5525 // must use default ABI alignment as external users rely on it. Same
5526 // for a function that may be called from a function pointer.
5527 if (!F || !F->hasLocalLinkage() ||
5528 F->hasAddressTaken(/*Users=*/nullptr,
5529 /*IgnoreCallbackUses=*/false,
5530 /*IgnoreAssumeLikeCalls=*/true,
5531 /*IgnoreLLVMUsed=*/true))
5532 return ABITypeAlign;
5533
5534 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5535 return std::max(Align(16), ABITypeAlign);
5536}
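// Illustrative example: a local-linkage device function taking a struct of
// four floats has an ABI .param alignment of only 4; bumping it to 16 here is
// what permits a single 128-bit vectorized ld.param for that argument instead
// of four scalar loads.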
5537
5538/// Helper for computing alignment of a device function byval parameter.
5539Align NVPTXTargetLowering::getFunctionByValParamAlign(
5540 const Function *F, Type *ArgTy, Align InitialAlign,
5541 const DataLayout &DL) const {
5542 Align ArgAlign = InitialAlign;
5543 // Try to increase alignment to enhance vectorization options.
5544 if (F)
5545 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5546
5547 // Old ptx versions have a bug. When PTX code takes the address of a
5548 // byval parameter with alignment < 4, ptxas generates code to
5549 // spill the argument into memory. Alas, on sm_50+ ptxas generates
5550 // SASS code that fails with a misaligned access. To work around
5551 // the problem, make sure that we align byval parameters to at
5552 // least 4. This bug seems to be fixed at least starting from
5553 // ptxas > 9.0.
5554 // TODO: remove this after verifying the bug is not reproduced
5555 // on non-deprecated ptxas versions.
5556 if (ForceMinByValParamAlign)
5557 ArgAlign = std::max(ArgAlign, Align(4));
5558
5559 return ArgAlign;
5560}
5561
5562// Helper for getting a function parameter name. Name is composed from
5563// its index and the function name. A negative index corresponds to the
5564// special parameter (unsized array) used for passing variable arguments.
5565std::string NVPTXTargetLowering::getParamName(const Function *F,
5566 int Idx) const {
5567 std::string ParamName;
5568 raw_string_ostream ParamStr(ParamName);
5569
5570 ParamStr << getTargetMachine().getSymbol(F)->getName();
5571 if (Idx < 0)
5572 ParamStr << "_vararg";
5573 else
5574 ParamStr << "_param_" << Idx;
5575
5576 return ParamName;
5577}
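// Illustrative example: for a function whose symbol is "foo", parameter 1 is
// named "foo_param_1" and the variadic parameter is named "foo_vararg".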
5578
5579/// isLegalAddressingMode - Return true if the addressing mode represented
5580/// by AM is legal for this target, for a load/store of the specified type.
5581/// Used to guide target specific optimizations, like loop strength reduction
5582/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5583/// (CodeGenPrepare.cpp)
5584bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5585 const AddrMode &AM, Type *Ty,
5586 unsigned AS, Instruction *I) const {
5587 // AddrMode - This represents an addressing mode of:
5588 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5589 //
5590 // The legal address modes are
5591 // - [avar]
5592 // - [areg]
5593 // - [areg+immoff]
5594 // - [immAddr]
5595
5596 // immoff must fit in a signed 32-bit int
5597 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5598 return false;
5599
5600 if (AM.BaseGV)
5601 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5602
5603 switch (AM.Scale) {
5604 case 0: // "r", "r+i" or "i" is allowed
5605 break;
5606 case 1:
5607 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5608 return false;
5609 // Otherwise we have r+i.
5610 break;
5611 default:
5612 // No scale > 1 is allowed
5613 return false;
5614 }
5615 return true;
5616}
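// Illustrative examples: [gv], [reg], [reg+100], and [0x1000] are accepted;
// [reg+reg], [reg+2*reg], and [gv+reg] are rejected, as is any immediate
// offset that does not fit in a signed 32-bit integer.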
5617
5618//===----------------------------------------------------------------------===//
5619// NVPTX Inline Assembly Support
5620//===----------------------------------------------------------------------===//
5621
5622/// getConstraintType - Given a constraint letter, return the type of
5623/// constraint it is for this target.
5624NVPTXTargetLowering::ConstraintType
5625NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5626 if (Constraint.size() == 1) {
5627 switch (Constraint[0]) {
5628 default:
5629 break;
5630 case 'b':
5631 case 'r':
5632 case 'h':
5633 case 'c':
5634 case 'l':
5635 case 'f':
5636 case 'd':
5637 case 'q':
5638 case '0':
5639 case 'N':
5640 return C_RegisterClass;
5641 }
5642 }
5643 return TargetLowering::getConstraintType(Constraint);
5644}
5645
5646std::pair<unsigned, const TargetRegisterClass *>
5647NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5648 StringRef Constraint,
5649 MVT VT) const {
5650 if (Constraint.size() == 1) {
5651 switch (Constraint[0]) {
5652 case 'b':
5653 return std::make_pair(0U, &NVPTX::B1RegClass);
5654 case 'c':
5655 case 'h':
5656 return std::make_pair(0U, &NVPTX::B16RegClass);
5657 case 'r':
5658 case 'f':
5659 return std::make_pair(0U, &NVPTX::B32RegClass);
5660 case 'l':
5661 case 'N':
5662 case 'd':
5663 return std::make_pair(0U, &NVPTX::B64RegClass);
5664 case 'q': {
5665 if (STI.getSmVersion() < 70)
5666 report_fatal_error("Inline asm with 128 bit operands is only "
5667 "supported for sm_70 and higher!");
5668 return std::make_pair(0U, &NVPTX::B128RegClass);
5669 }
5670 }
5671 }
5672 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5673}
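// Illustrative example: an inline asm statement such as
//   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
// maps each "r" constraint to the 32-bit B32 register class above.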
5674
5675//===----------------------------------------------------------------------===//
5676// NVPTX DAG Combining
5677//===----------------------------------------------------------------------===//
5678
5678
5679bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5680 CodeGenOptLevel OptLevel) const {
5681 // Always honor command-line argument
5682 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5683 return FMAContractLevelOpt > 0;
5684
5685 // Do not contract if we're not optimizing the code.
5686 if (OptLevel == CodeGenOptLevel::None)
5687 return false;
5688
5689 // Honor TargetOptions flags that explicitly say fusion is okay.
5690 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5691 return true;
5692
5693 return false;
5694}
5695
5696static bool isConstZero(const SDValue &Operand) {
5697 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5698 return Const && Const->getZExtValue() == 0;
5699}
5700
5701/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5702/// operands N0 and N1. This is a helper for PerformADDCombine that is
5703/// called with the default operands, and if that fails, with commuted
5704/// operands.
5705static SDValue
5706PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5707 TargetLowering::DAGCombinerInfo &DCI) {
5708 EVT VT = N0.getValueType();
5709
5710 // Since integer multiply-add costs the same as integer multiply
5711 // but is more costly than integer add, do the fusion only when
5712 // the mul is only used in the add.
5713 // TODO: this may not be true for later architectures, consider relaxing this
5714 if (!N0.getNode()->hasOneUse())
5715 return SDValue();
5716
5717 // fold (add (select cond, 0, (mul a, b)), c)
5718 // -> (select cond, c, (add (mul a, b), c))
5719 //
5720 if (N0.getOpcode() == ISD::SELECT) {
5721 unsigned ZeroOpNum;
5722 if (isConstZero(N0->getOperand(1)))
5723 ZeroOpNum = 1;
5724 else if (isConstZero(N0->getOperand(2)))
5725 ZeroOpNum = 2;
5726 else
5727 return SDValue();
5728
5729 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5730 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5731 return SDValue();
5732
5733 SDLoc DL(N);
5734 SDValue Mul =
5735 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5736 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5737 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5738 ((ZeroOpNum == 1) ? N1 : MAD),
5739 ((ZeroOpNum == 1) ? MAD : N1));
5740 }
5741
5742 return SDValue();
5743}
5744
5745static SDValue
5746PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5747 TargetLowering::DAGCombinerInfo &DCI,
5748 CodeGenOptLevel OptLevel) {
5749 EVT VT = N0.getValueType();
5750 if (N0.getOpcode() == ISD::FMUL) {
5751 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5752 &DCI.DAG.getTargetLoweringInfo());
5753 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5754 (N->getFlags().hasAllowContract() &&
5755 N0->getFlags().hasAllowContract())))
5756 return SDValue();
5757
5758 // For floating point:
5759 // Do the fusion only when the mul has fewer than 5 uses, all of
5760 // which are adds.
5761 // The heuristic is that if a use is not an add, then that use
5762 // cannot be fused into an fma, so the mul is still needed anyway.
5763 // If there are more than 4 uses, even if they are all adds, fusing
5764 // them will increase register pressure.
5765 //
5766 int numUses = 0;
5767 int nonAddCount = 0;
5768 for (const SDNode *User : N0.getNode()->users()) {
5769 numUses++;
5770 if (User->getOpcode() != ISD::FADD)
5771 ++nonAddCount;
5772 if (numUses >= 5)
5773 return SDValue();
5774 }
5775 if (nonAddCount) {
5776 int orderNo = N->getIROrder();
5777 int orderNo2 = N0.getNode()->getIROrder();
5778 // Simple heuristic for estimating potential register pressure: the
5779 // difference in IR order is used to measure the distance between the
5780 // def and its uses; the longer the distance, the more likely it is to
5781 // cause register pressure.
5782 if (orderNo - orderNo2 < 500)
5783 return SDValue();
5784
5785 // Now, check if at least one of the FMUL's operands is live beyond the
5786 // node N, which guarantees that the FMA will not increase register
5787 // pressure at node N.
5788 bool opIsLive = false;
5789 const SDNode *left = N0.getOperand(0).getNode();
5790 const SDNode *right = N0.getOperand(1).getNode();
5791
5792 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5793 opIsLive = true;
5794
5795 if (!opIsLive)
5796 for (const SDNode *User : left->users()) {
5797 int orderNo3 = User->getIROrder();
5798 if (orderNo3 > orderNo) {
5799 opIsLive = true;
5800 break;
5801 }
5802 }
5803
5804 if (!opIsLive)
5805 for (const SDNode *User : right->users()) {
5806 int orderNo3 = User->getIROrder();
5807 if (orderNo3 > orderNo) {
5808 opIsLive = true;
5809 break;
5810 }
5811 }
5812
5813 if (!opIsLive)
5814 return SDValue();
5815 }
5816
5817 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5818 N0.getOperand(1), N1);
5819 }
5820
5821 return SDValue();
5822}
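// Illustrative example: when contraction is allowed and the fmul feeds only
// this fadd,
//   m: f32 = fmul a, b
//   r: f32 = fadd m, c
// is rewritten to
//   r: f32 = fma a, b, c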
5823
5824/// Fold unpacking movs into a load by increasing the number of return values.
5825///
5826/// ex:
5827/// L: v2f16,ch = load <p>
5828/// a: f16 = extractelt L:0, 0
5829/// b: f16 = extractelt L:0, 1
5830/// use(a, b)
5831///
5832/// ...is turned into...
5833///
5834/// L: f16,f16,ch = LoadV2 <p>
5835/// use(L:0, L:1)
5836static SDValue
5837combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5838 // Don't run this optimization before the legalizer
5839 if (!DCI.isAfterLegalizeDAG())
5840 return SDValue();
5841
5842 EVT ElementVT = N->getValueType(0);
5843 // Avoid non-packed types and v4i8
5844 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5845 return SDValue();
5846
5847 // Check whether all outputs are either used by an extractelt or are
5848 // glue/chain nodes
5849 if (!all_of(N->uses(), [&](SDUse &U) {
5850 // Skip glue, chain nodes
5851 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5852 return true;
5853 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5854 if (N->getOpcode() != ISD::LOAD)
5855 return true;
5856 // Since this is an ISD::LOAD, check all extractelts are used. If
5857 // any are not used, we don't want to defeat another optimization that
5858 // will narrow the load.
5859 //
5860 // For example:
5861 //
5862 // L: v2f16,ch = load <p>
5863 // e0: f16 = extractelt L:0, 0
5864 // e1: f16 = extractelt L:0, 1 <-- unused
5865 // store e0
5866 //
5867 // Can be optimized by DAGCombiner to:
5868 //
5869 // L: f16,ch = load <p>
5870 // store L:0
5871 return !U.getUser()->use_empty();
5872 }
5873
5874 // Otherwise, this use prevents us from splitting a value.
5875 return false;
5876 }))
5877 return SDValue();
5878
5879 auto *LD = cast<MemSDNode>(N);
5880 SDLoc DL(LD);
5881
5882 // The new opcode after we double the number of outputs.
5883 unsigned Opcode;
5884 SmallVector<SDValue> Operands(LD->ops());
5885 unsigned OldNumOutputs; // non-glue, non-chain outputs
5886 switch (LD->getOpcode()) {
5887 case ISD::LOAD:
5888 OldNumOutputs = 1;
5889 // Any packed type is legal, so the legalizer will not have lowered
5890 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5891 // here.
5892 Opcode = NVPTXISD::LoadV2;
5893 // append a "full" used bytes mask operand right before the extension type
5894 // operand, signifying that all bytes are used.
5895 Operands.push_back(DCI.DAG.getConstant(UINT32_MAX, DL, MVT::i32));
5896 Operands.push_back(DCI.DAG.getIntPtrConstant(
5897 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5898 break;
5899 case NVPTXISD::LoadV2:
5900 OldNumOutputs = 2;
5901 Opcode = NVPTXISD::LoadV4;
5902 break;
5903 case NVPTXISD::LoadV4:
5904 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5905 // load size here. This is already a 256-bit load.
5906 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5907 return SDValue();
5908 OldNumOutputs = 4;
5909 Opcode = NVPTXISD::LoadV8;
5910 break;
5911 case NVPTXISD::LoadV8:
5912 // PTX doesn't support the next doubling of outputs
5913 return SDValue();
5914 }
5915
5916 // the non-glue, non-chain outputs in the new load
5917 const unsigned NewNumOutputs = OldNumOutputs * 2;
5918 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5919 // add remaining chain and glue values
5920 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5921
5922 // Create the new load
5923 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5924 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5925 LD->getMemOperand());
5926
5927 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5928 // the outputs the same. These nodes will be optimized away in later
5929 // DAGCombiner iterations.
5930 SmallVector<SDValue> Results;
5931 for (unsigned I : seq(OldNumOutputs))
5932 Results.push_back(DCI.DAG.getBuildVector(
5933 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5934 // Add remaining chain and glue nodes
5935 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5936 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5937
5938 return DCI.DAG.getMergeValues(Results, DL);
5939}
5940
5941/// Fold packing movs into a store.
5942///
5943/// ex:
5944/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5945/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5946/// StoreV2 v1, v2
5947///
5948/// ...is turned into...
5949///
5950/// StoreV4 a, b, c, d
5951static SDValue combinePackingMovIntoStore(SDNode *N,
5952 TargetLowering::DAGCombinerInfo &DCI,
5953 unsigned Front, unsigned Back) {
5954 // We want to run this as late as possible since other optimizations may
5955 // eliminate the BUILD_VECTORs.
5956 if (!DCI.isAfterLegalizeDAG())
5957 return SDValue();
5958
5959 // Get the type of the operands being stored.
5960 EVT ElementVT = N->getOperand(Front).getValueType();
5961
5962 // Avoid non-packed types and v4i8
5963 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5964 return SDValue();
5965
5966 auto *ST = cast<MemSDNode>(N);
5967
5968 // The new opcode after we double the number of operands.
5969 unsigned Opcode;
5970 switch (N->getOpcode()) {
5971 case ISD::STORE:
5972 // Any packed type is legal, so the legalizer will not have lowered
5973 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5974 // it here.
5975 Opcode = NVPTXISD::StoreV2;
5976 break;
5977 case NVPTXISD::StoreV2:
5978 Opcode = NVPTXISD::StoreV4;
5979 break;
5980 case NVPTXISD::StoreV4:
5981 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5982 // store size here. This is already a 256-bit store.
5983 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5984 return SDValue();
5985 Opcode = NVPTXISD::StoreV8;
5986 break;
5987 case NVPTXISD::StoreV8:
5988 // PTX doesn't support the next doubling of operands
5989 return SDValue();
5990 default:
5991 llvm_unreachable("Unhandled store opcode");
5992 }
5993
5994 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5995 // their elements.
5996 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5997 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5998 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5999 return SDValue();
6000
6001 // If the operand has multiple uses, this optimization can increase register
6002 // pressure.
6003 if (!BV.hasOneUse())
6004 return SDValue();
6005
6006 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
6007 // any signs they may be folded by some other pattern or rule.
6008 for (SDValue Op : BV->ops()) {
6009 // Peek through bitcasts
6010 if (Op.getOpcode() == ISD::BITCAST)
6011 Op = Op.getOperand(0);
6012
6013 // This may be folded into a PRMT.
6014 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
6015 Op->getOperand(0).getValueType() == MVT::i32)
6016 return SDValue();
6017
6018 // This may be folded into cvt.bf16x2
6019 if (Op.getOpcode() == ISD::FP_ROUND)
6020 return SDValue();
6021 }
6022 Operands.append({BV.getOperand(0), BV.getOperand(1)});
6023 }
6024 Operands.append(N->op_end() - Back, N->op_end());
6025
6026 // Now we replace the store
6027 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
6028 ST->getMemoryVT(), ST->getMemOperand());
6029}
6030
6032 const NVPTXSubtarget &STI) {
6033
6034 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
6035 // Here is our chance to custom lower a store with a non-simple type.
6036 // Unfortunately, we can't do this in the legalizer because there is no
6037 // way to setOperationAction for a non-simple type.
6038 auto *ST = cast<StoreSDNode>(N);
6039 if (!ST->getValue().getValueType().isSimple())
6040 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
6041 }
6042
6043 return combinePackingMovIntoStore(N, DCI, 1, 2);
6044}
6045
6047 const NVPTXSubtarget &STI) {
6048 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
6049 // Here is our chance to custom lower a load with a non-simple type.
6050 // Unfortunately, we can't do this in the legalizer because there is no
6051 // way to setOperationAction for a non-simple type.
6052 if (!N->getValueType(0).isSimple())
6053 return lowerLoadVector(N, DCI.DAG, STI);
6054 }
6055
6056 return combineUnpackingMovIntoLoad(N, DCI);
6057}
6058
6059/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
6060///
6061static SDValue PerformADDCombine(SDNode *N,
6062 TargetLowering::DAGCombinerInfo &DCI,
6063 CodeGenOptLevel OptLevel) {
6064 if (OptLevel == CodeGenOptLevel::None)
6065 return SDValue();
6066
6067 SDValue N0 = N->getOperand(0);
6068 SDValue N1 = N->getOperand(1);
6069
6070 // Skip vector types and any scalar type other than i32
6071 EVT VT = N0.getValueType();
6072 if (VT.isVector() || VT != MVT::i32)
6073 return SDValue();
6074
6075 // First try with the default operand order.
6076 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
6077 return Result;
6078
6079 // If that didn't work, try again with the operands commuted.
6080 return PerformADDCombineWithOperands(N, N1, N0, DCI);
6081}
6082
6083/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
6084///
6085static SDValue PerformFADDCombine(SDNode *N,
6086 TargetLowering::DAGCombinerInfo &DCI,
6087 CodeGenOptLevel OptLevel) {
6088 SDValue N0 = N->getOperand(0);
6089 SDValue N1 = N->getOperand(1);
6090
6091 EVT VT = N0.getValueType();
6092 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
6093 return SDValue();
6094
6095 // First try with the default operand order.
6096 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
6097 return Result;
6098
6099 // If that didn't work, try again with the operands commuted.
6100 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
6101}
6102
6103/// Get 3-input version of a 2-input min/max opcode
6104static unsigned getMinMax3Opcode(unsigned MinMax2Opcode) {
6105 switch (MinMax2Opcode) {
6106 case ISD::FMAXNUM:
6107 case ISD::FMAXIMUMNUM:
6108 return NVPTXISD::FMAXNUM3;
6109 case ISD::FMINNUM:
6110 case ISD::FMINIMUMNUM:
6111 return NVPTXISD::FMINNUM3;
6112 case ISD::FMAXIMUM:
6113 return NVPTXISD::FMAXIMUM3;
6114 case ISD::FMINIMUM:
6115 return NVPTXISD::FMINIMUM3;
6116 default:
6117 llvm_unreachable("Invalid 2-input min/max opcode");
6118 }
6119}
6120
6121/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
6122/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
6123static SDValue PerformFMinMaxCombine(SDNode *N,
6124 TargetLowering::DAGCombinerInfo &DCI,
6125 unsigned PTXVersion, unsigned SmVersion) {
6126
6127 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
6128 EVT VT = N->getValueType(0);
6129 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
6130 return SDValue();
6131
6132 SDValue Op0 = N->getOperand(0);
6133 SDValue Op1 = N->getOperand(1);
6134 unsigned MinMaxOp2 = N->getOpcode();
6135 unsigned MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
6136
6137 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
6138 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
6139 SDValue A = Op0.getOperand(0);
6140 SDValue B = Op0.getOperand(1);
6141 SDValue C = Op1;
6142 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6143 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
6144 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
6145 SDValue A = Op0;
6146 SDValue B = Op1.getOperand(0);
6147 SDValue C = Op1.getOperand(1);
6148 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6149 }
6150 return SDValue();
6151}
6152
6153static SDValue PerformREMCombine(SDNode *N,
6154 TargetLowering::DAGCombinerInfo &DCI,
6155 CodeGenOptLevel OptLevel) {
6156 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
6157
6158 // Don't do anything at less than -O2.
6159 if (OptLevel < CodeGenOptLevel::Default)
6160 return SDValue();
6161
6162 SelectionDAG &DAG = DCI.DAG;
6163 SDLoc DL(N);
6164 EVT VT = N->getValueType(0);
6165 bool IsSigned = N->getOpcode() == ISD::SREM;
6166 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
6167
6168 const SDValue &Num = N->getOperand(0);
6169 const SDValue &Den = N->getOperand(1);
6170
6171 for (const SDNode *U : Num->users()) {
6172 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
6173 U->getOperand(1) == Den) {
6174 // Num % Den -> Num - (Num / Den) * Den
6175 return DAG.getNode(ISD::SUB, DL, VT, Num,
6176 DAG.getNode(ISD::MUL, DL, VT,
6177 DAG.getNode(DivOpc, DL, VT, Num, Den),
6178 Den));
6179 }
6180 }
6181 return SDValue();
6182}
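// Illustrative example: when the IR computes both the quotient and the
// remainder of the same operands,
//   q: i32 = udiv a, b
//   r: i32 = urem a, b
// the urem is rewritten to sub(a, mul(q, b)), so a single division serves
// both results.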
6183
6184// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
6186 CodeGenOptLevel OptLevel) {
6187 if (OptLevel == CodeGenOptLevel::None)
6188 return SDValue();
6189
6190 SDValue Op = N->getOperand(0);
6191 if (!Op.hasOneUse())
6192 return SDValue();
6193 EVT ToVT = N->getValueType(0);
6194 EVT FromVT = Op.getValueType();
6195 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
6196 (ToVT == MVT::i64 && FromVT == MVT::i32)))
6197 return SDValue();
6198 if (!(Op.getOpcode() == ISD::MUL ||
6199 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
6200 return SDValue();
6201
6202 SDLoc DL(N);
6203 unsigned ExtOpcode = N->getOpcode();
6204 unsigned Opcode = 0;
6205 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
6206 Opcode = NVPTXISD::MUL_WIDE_SIGNED;
6207 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
6208 Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
6209 else
6210 return SDValue();
6211 SDValue RHS = Op.getOperand(1);
6212 if (Op.getOpcode() == ISD::SHL) {
6213 const auto ShiftAmt = Op.getConstantOperandVal(1);
6214 const auto MulVal = APInt(FromVT.getSizeInBits(), 1) << ShiftAmt;
6215 RHS = DCI.DAG.getConstant(MulVal, DL, FromVT);
6216 }
6217 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
6218}
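// Illustrative example: with the no-signed-wrap flag present,
//   m: i32 = mul nsw a, b
//   e: i64 = sign_extend m
// becomes a single NVPTXISD::MUL_WIDE_SIGNED node, i.e. roughly mul.wide.s32.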
6219
6220enum OperandSignedness {
6221 Signed = 0,
6222 Unsigned,
6223 Unknown
6224};
6225
6226/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
6227/// that can be demoted to \p OptSize bits without loss of information. The
6228/// signedness of the operand, if determinable, is placed in \p S.
6229static bool IsMulWideOperandDemotable(SDValue Op,
6230 unsigned OptSize,
6231 OperandSignedness &S) {
6232 S = Unknown;
6233
6234 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
6235 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
6236 EVT OrigVT = Op.getOperand(0).getValueType();
6237 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6238 S = Signed;
6239 return true;
6240 }
6241 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
6242 EVT OrigVT = Op.getOperand(0).getValueType();
6243 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6244 S = Unsigned;
6245 return true;
6246 }
6247 }
6248
6249 return false;
6250}
6251
6252/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
6253/// be demoted to \p OptSize bits without loss of information. If the operands
6254/// contain a constant, it should appear as the RHS operand. The signedness of
6255/// the operands is placed in \p IsSigned.
6256static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
6257 unsigned OptSize,
6258 bool &IsSigned) {
6259 OperandSignedness LHSSign;
6260
6261 // The LHS operand must be a demotable op
6262 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
6263 return false;
6264
6265 // We should have been able to determine the signedness from the LHS
6266 if (LHSSign == Unknown)
6267 return false;
6268
6269 IsSigned = (LHSSign == Signed);
6270
6271 // The RHS can be a demotable op or a constant
6273 const APInt &Val = CI->getAPIntValue();
6274 if (LHSSign == Unsigned) {
6275 return Val.isIntN(OptSize);
6276 } else {
6277 return Val.isSignedIntN(OptSize);
6278 }
6279 } else {
6280 OperandSignedness RHSSign;
6281 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
6282 return false;
6283
6284 return LHSSign == RHSSign;
6285 }
6286}
6287
6288/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
6289/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
6290/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
6291/// amount.
6292 static SDValue TryMULWIDECombine(SDNode *N,
6293                                  TargetLowering::DAGCombinerInfo &DCI) {
6294   EVT MulType = N->getValueType(0);
6295 if (MulType != MVT::i32 && MulType != MVT::i64) {
6296 return SDValue();
6297 }
6298
6299 SDLoc DL(N);
6300 unsigned OptSize = MulType.getSizeInBits() >> 1;
6301 SDValue LHS = N->getOperand(0);
6302 SDValue RHS = N->getOperand(1);
6303
6304 // Canonicalize the multiply so the constant (if any) is on the right
6305 if (N->getOpcode() == ISD::MUL) {
6306 if (isa<ConstantSDNode>(LHS)) {
6307 std::swap(LHS, RHS);
6308 }
6309 }
6310
6311 // If we have a SHL, determine the actual multiply amount
6312 if (N->getOpcode() == ISD::SHL) {
6313     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
6314     if (!ShlRHS) {
6315 return SDValue();
6316 }
6317
6318 APInt ShiftAmt = ShlRHS->getAPIntValue();
6319 unsigned BitWidth = MulType.getSizeInBits();
6320 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
6321 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
6322 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
6323 } else {
6324 return SDValue();
6325 }
6326 }
6327
6328 bool Signed;
6329 // Verify that our operands are demotable
6330 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
6331 return SDValue();
6332 }
6333
6334 EVT DemotedVT;
6335 if (MulType == MVT::i32) {
6336 DemotedVT = MVT::i16;
6337 } else {
6338 DemotedVT = MVT::i32;
6339 }
6340
6341 // Truncate the operands to the correct size. Note that these are just for
6342 // type consistency and will (likely) be eliminated in later phases.
6343 SDValue TruncLHS =
6344 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
6345 SDValue TruncRHS =
6346 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
6347
6348 unsigned Opc;
6349 if (Signed) {
6350 Opc = NVPTXISD::MUL_WIDE_SIGNED;
6351 } else {
6352 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
6353 }
6354
6355 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
6356}
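//
// A rough example of the transform above: an i32 multiply whose operands are
// both sign-extended from i16 values, e.g.
//   (mul i32 (sext i16 a), (sext i16 b))
// is demoted to (MUL_WIDE_SIGNED a, b) producing the i32 result directly, which
// selects to PTX mul.wide.s16. A left shift by a constant is handled the same
// way after being rewritten as a multiply by (1 << amount).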
6357
6358static bool isConstOne(const SDValue &Operand) {
6359 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
6360 return Const && Const->getZExtValue() == 1;
6361}
6362
6363 static SDValue matchMADConstOnePattern(SDValue Add) {
6364   if (Add->getOpcode() != ISD::ADD)
6365 return SDValue();
6366
6367 if (isConstOne(Add->getOperand(0)))
6368 return Add->getOperand(1);
6369
6370 if (isConstOne(Add->getOperand(1)))
6371 return Add->getOperand(0);
6372
6373 return SDValue();
6374}
6375
6376 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
6377                                   TargetLowering::DAGCombinerInfo &DCI) {
6378
6379   if (SDValue Y = matchMADConstOnePattern(Add)) {
6380 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6381 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
6382 }
6383
6384 return SDValue();
6385}
6386
6387 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
6388                                         SDLoc DL,
6389                                         TargetLowering::DAGCombinerInfo &DCI) {
6390   if (Select->getOpcode() != ISD::SELECT)
6391 return SDValue();
6392
6393 SDValue Cond = Select->getOperand(0);
6394
6395 unsigned ConstOpNo;
6396 if (isConstOne(Select->getOperand(1)))
6397 ConstOpNo = 1;
6398 else if (isConstOne(Select->getOperand(2)))
6399 ConstOpNo = 2;
6400 else
6401 return SDValue();
6402
6403 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
6404
6405 // Do not combine if the resulting sequence is not obviously profitable.
6406   if (!matchMADConstOnePattern(Y))
6407     return SDValue();
6408
6409 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6410
6411 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
6412 (ConstOpNo == 1) ? X : NewMul,
6413 (ConstOpNo == 1) ? NewMul : X);
6414}
6415
6416static SDValue
6417 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
6418                               TargetLowering::DAGCombinerInfo &DCI) {
6419
6420 EVT VT = N0.getValueType();
6421 if (VT.isVector())
6422 return SDValue();
6423
6424 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
6425 return SDValue();
6426
6427 SDLoc DL(N);
6428
6429 // (mul x, (add y, 1)) -> (add (mul x, y), x)
6430 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
6431 return Res;
6432 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
6433 return Res;
6434
6435 // (mul x, (select y, 1)) -> (select (mul x, y), x)
6436 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
6437 return Res;
6438 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
6439 return Res;
6440
6441 return SDValue();
6442}
6443
6444/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
6445 static SDValue PerformMULCombine(SDNode *N,
6446                                  TargetLowering::DAGCombinerInfo &DCI,
6447                                  CodeGenOptLevel OptLevel) {
6448 if (OptLevel == CodeGenOptLevel::None)
6449 return SDValue();
6450
6451 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6452 return Ret;
6453
6454 SDValue N0 = N->getOperand(0);
6455 SDValue N1 = N->getOperand(1);
6456 return PerformMULCombineWithOperands(N, N0, N1, DCI);
6457}
6458
6459/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
6460 static SDValue PerformSHLCombine(SDNode *N,
6461                                  TargetLowering::DAGCombinerInfo &DCI,
6462                                  CodeGenOptLevel OptLevel) {
6463 if (OptLevel > CodeGenOptLevel::None) {
6464 // Try mul.wide combining at OptLevel > 0
6465 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6466 return Ret;
6467 }
6468
6469 return SDValue();
6470}
6471
6472 static SDValue PerformSETCCCombine(SDNode *N,
6473                                    TargetLowering::DAGCombinerInfo &DCI,
6474                                    unsigned int SmVersion) {
6475 EVT CCType = N->getValueType(0);
6476 SDValue A = N->getOperand(0);
6477 SDValue B = N->getOperand(1);
6478
6479 EVT AType = A.getValueType();
6480 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
6481 return SDValue();
6482
6483 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
6484 return SDValue();
6485
6486 SDLoc DL(N);
6487 // setp.f16x2 returns two scalar predicates, which we need to
6488 // convert back to v2i1. The returned result will be scalarized by
6489 // the legalizer, but the comparison will remain a single vector
6490 // instruction.
6491 SDValue CCNode = DCI.DAG.getNode(
6492 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
6493                                      : NVPTXISD::SETP_BF16X2,
6494       DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
6495 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
6496 CCNode.getValue(1));
6497}
6498
6499 static SDValue PerformEXTRACTCombine(SDNode *N,
6500                                      TargetLowering::DAGCombinerInfo &DCI) {
6501   SDValue Vector = N->getOperand(0);
6502 if (Vector->getOpcode() == ISD::FREEZE)
6503 Vector = Vector->getOperand(0);
6504 SDLoc DL(N);
6505 EVT VectorVT = Vector.getValueType();
6506 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6507 IsPTXVectorType(VectorVT.getSimpleVT()))
6508 return SDValue(); // Native vector loads already combine nicely w/
6509 // extract_vector_elt.
6510 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6511 // we already handle them OK.
6512 if (VectorVT.getVectorNumElements() == 1 ||
6513 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6514 return SDValue();
6515
6516 // Don't mess with undef values as sra may be simplified to 0, not undef.
6517 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6518 return SDValue();
6519
6520 uint64_t VectorBits = VectorVT.getSizeInBits();
6521 // We only handle the types we can extract in-register.
6522 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6523 return SDValue();
6524
6525 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6526 // Index == 0 is handled by generic DAG combiner.
6527 if (!Index || Index->getZExtValue() == 0)
6528 return SDValue();
6529
6530 MVT IVT = MVT::getIntegerVT(VectorBits);
6531 EVT EltVT = VectorVT.getVectorElementType();
6532 EVT EltIVT = EltVT.changeTypeToInteger();
6533 uint64_t EltBits = EltVT.getScalarSizeInBits();
6534
6535 SDValue Result = DCI.DAG.getNode(
6536 ISD::TRUNCATE, DL, EltIVT,
6537 DCI.DAG.getNode(
6538 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6539 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6540
6541 // If element has non-integer type, bitcast it back to the expected type.
6542 if (EltVT != EltIVT)
6543 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6544   // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
6545 if (EltVT != N->getValueType(0))
6546 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6547
6548 return Result;
6549}
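//
// For example, extracting element 1 of a v4i8 value held in a 32-bit register
// becomes, roughly,
//   (trunc i8 (sra (bitcast v4i8 v to i32), 8))
// i.e. a shift by Index * EltBits followed by a truncate, so no memory
// round-trip is needed for the extract.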
6550
6551/// Transform patterns like:
6552/// (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt))
6553/// (select (ult shift_amt, BitWidth), (srl/shl x, shift_amt), 0)
6554/// Into:
6555/// (NVPTXISD::SRL_CLAMP x, shift_amt) or (NVPTXISD::SHL_CLAMP x, shift_amt)
6556///
6557/// These patterns arise from C/C++ code like `shift >= 32 ? 0 : x >> shift`
6558/// which guards against undefined behavior. PTX shr/shl instructions clamp
6559/// shift amounts >= BitWidth to produce 0 for logical shifts, making the
6560/// guard redundant.
6561///
6562/// Note: We only handle SRL and SHL, not SRA, because arithmetic right
6563/// shifts could produce 0 or -1 when shift >= BitWidth.
6564/// Note: We don't handle uge or ule. These don't appear because of
6565/// canonicalization.
6566 static SDValue PerformSELECTShiftCombine(SDNode *N,
6567                                          TargetLowering::DAGCombinerInfo &DCI) {
6568   if (!DCI.isAfterLegalizeDAG())
6569 return SDValue();
6570
6571 using namespace SDPatternMatch;
6572 unsigned BitWidth = N->getValueType(0).getSizeInBits();
6573 SDValue ShiftAmt, ShiftOp;
6574
6575 // Match logical shifts where the shift amount in the guard matches the shift
6576 // amount in the operation.
6577 auto LogicalShift =
6578 m_AllOf(m_Value(ShiftOp),
6579 m_AnyOf(m_Srl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))),
6580 m_Shl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt)))));
6581
6582 // shift_amt > BitWidth-1 ? 0 : shift_op
6583 bool MatchedUGT =
6584 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
6585                                        m_SpecificInt(BitWidth - 1),
6586                                        m_SpecificCondCode(ISD::SETUGT)),
6587 m_Zero(), LogicalShift));
6588 // shift_amt < BitWidth ? shift_op : 0
6589 bool MatchedULT =
6590 !MatchedUGT &&
6591 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
6592                                        m_SpecificInt(BitWidth),
6593                                        m_SpecificCondCode(ISD::SETULT)),
6594 LogicalShift, m_Zero()));
6595
6596 if (!MatchedUGT && !MatchedULT)
6597 return SDValue();
6598
6599 // Return a clamp shift operation, which has the same semantics as PTX shift.
6600 unsigned ClampOpc = ShiftOp.getOpcode() == ISD::SRL ? NVPTXISD::SRL_CLAMP
6601 : NVPTXISD::SHL_CLAMP;
6602 return DCI.DAG.getNode(ClampOpc, SDLoc(N), ShiftOp.getValueType(),
6603 ShiftOp.getOperand(0), ShiftOp.getOperand(1));
6604}
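//
// As an illustration, C code of the form
//   unsigned r = s >= 32 ? 0 : x >> s;
// reaches this combine as (select (setcc s, 31, ugt), 0, (srl x, s)) and is
// folded to a single SRL_CLAMP node, because the PTX shr instruction already
// produces 0 for shift amounts of 32 or more.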
6605
6606 static SDValue PerformVSELECTCombine(SDNode *N,
6607                                      TargetLowering::DAGCombinerInfo &DCI) {
6608   SDValue VA = N->getOperand(1);
6609 EVT VectorVT = VA.getValueType();
6610 if (VectorVT != MVT::v4i8)
6611 return SDValue();
6612
6613   // We need to split the vselect into individual per-element operations. Because
6614   // we use the BFE/BFI instructions for byte extraction/insertion, we end up with
6615   // 32-bit values anyway, so we may as well do the comparison as i32 to avoid the
6616   // conversions to/from i16 normally used for i8 values.
6617   SmallVector<SDValue, 4> E;
6618   SDLoc DL(N);
6619 SDValue VCond = N->getOperand(0);
6620 SDValue VB = N->getOperand(2);
6621 for (int I = 0; I < 4; ++I) {
6622 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6623 DCI.DAG.getConstant(I, DL, MVT::i32));
6624 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6625 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6626 DCI.DAG.getConstant(I, DL, MVT::i32)),
6627 DL, MVT::i32);
6628 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6629 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6630 DCI.DAG.getConstant(I, DL, MVT::i32)),
6631 DL, MVT::i32);
6632 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6633 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6634 }
6635 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6636}
6637
6638static SDValue
6639 PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
6640   auto VT = N->getValueType(0);
6641 if (!DCI.isAfterLegalizeDAG() ||
6642 // only process v2*16 types
6643 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6644 VT.getVectorNumElements() == 2))
6645 return SDValue();
6646
6647 auto Op0 = N->getOperand(0);
6648 auto Op1 = N->getOperand(1);
6649
6650 // Start out by assuming we want to take the lower 2 bytes of each i32
6651 // operand.
6652 uint64_t Op0Bytes = 0x10;
6653 uint64_t Op1Bytes = 0x54;
6654
6655 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6656 {&Op1, &Op1Bytes}};
6657
6658 // Check that each operand is an i16, truncated from an i32 operand. We'll
6659 // select individual bytes from those original operands. Optionally, fold in a
6660 // shift right of that original operand.
6661 for (auto &[Op, OpBytes] : OpData) {
6662 // Eat up any bitcast
6663 if (Op->getOpcode() == ISD::BITCAST)
6664 *Op = Op->getOperand(0);
6665
6666 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6667 Op->getOperand(0).getValueType() == MVT::i32))
6668 return SDValue();
6669
6670 // If the truncate has multiple uses, this optimization can increase
6671 // register pressure
6672 if (!Op->hasOneUse())
6673 return SDValue();
6674
6675 *Op = Op->getOperand(0);
6676
6677 // Optionally, fold in a shift-right of the original operand and let permute
6678 // pick the two higher bytes of the original value directly.
6679 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6680 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6681 // Shift the PRMT byte selector to pick upper bytes from each respective
6682 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6683 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6684 "PRMT selector values out of range");
6685 *OpBytes += 0x22;
6686 *Op = Op->getOperand(0);
6687 }
6688 }
6689 }
6690
6691 SDLoc DL(N);
6692 auto &DAG = DCI.DAG;
6693
6694 auto PRMT =
6695 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6696 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6697 return DAG.getBitcast(VT, PRMT);
6698}
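//
// Byte-selector arithmetic used above, for reference: with Op0Bytes = 0x10 and
// Op1Bytes = 0x54 the combined PRMT selector is 0x5410, i.e. bytes {1,0} of
// the first i32 and bytes {5,4} of the second. Folding a shift-right by 16
// into an operand adds 0x22 to its selector (0x10 -> 0x32, 0x54 -> 0x76), so
// the upper bytes of that operand are picked directly.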
6699
6700 static SDValue combineADDRSPACECAST(SDNode *N,
6701                                     TargetLowering::DAGCombinerInfo &DCI) {
6702   auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6703
6704 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
6705 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6706
6707 // Fold asc[B -> A](asc[A -> B](x)) -> x
6708 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6709 return ASCN2->getOperand(0);
6710 }
6711
6712 return SDValue();
6713}
6714
6715// Given a constant selector value and a prmt mode, return the selector value
6716// normalized to the generic prmt mode. See the PTX ISA documentation for more
6717// details:
6718// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6719static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6720 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6721
6722   if (Mode == NVPTX::PTXPrmtMode::NONE)
6723     return Selector;
6724
6725 const unsigned V = Selector.trunc(2).getZExtValue();
6726
6727 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6728 unsigned S3) {
6729 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6730 };
6731
6732 switch (Mode) {
6733   case NVPTX::PTXPrmtMode::F4E:
6734     return GetSelector(V, V + 1, V + 2, V + 3);
6735   case NVPTX::PTXPrmtMode::B4E:
6736     return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6737   case NVPTX::PTXPrmtMode::RC8:
6738     return GetSelector(V, V, V, V);
6739   case NVPTX::PTXPrmtMode::ECL:
6740     return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6741   case NVPTX::PTXPrmtMode::ECR:
6742     return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6743   case NVPTX::PTXPrmtMode::RC16: {
6744     unsigned V1 = (V & 1) << 1;
6745     return GetSelector(V1, V1 + 1, V1, V1 + 1);
6746 }
6747 default:
6748 llvm_unreachable("Invalid PRMT mode");
6749 }
6750}
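//
// A small worked example: in F4E mode with a selector whose low two bits are
// V = 1, the normalized generic selector is GetSelector(1, 2, 3, 4) = 0x4321,
// i.e. the four result bytes come from positions 1..4 of the concatenated
// {b, a} byte field.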
6751
6752static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6753 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6754 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6755 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6756 APInt BitField = B.concat(A);
6757 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6758 APInt Result(32, 0);
6759 for (unsigned I : llvm::seq(4U)) {
6760 APInt Sel = SelectorVal.extractBits(4, I * 4);
6761 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6762 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6763 APInt Byte = BitField.extractBits(8, Idx * 8);
6764 if (Sign)
6765 Byte = Byte.ashr(8);
6766 Result.insertBits(Byte, I * 8);
6767 }
6768 return Result;
6769}
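//
// Worked example of this constant fold: with A = 0x44332211, B = 0x88776655
// and a generic-mode selector of 0x5410, the selected bytes are (high to low)
// {0x66, 0x55, 0x22, 0x11}, so the folded result is 0x66552211.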
6770
6771 static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
6772                            CodeGenOptLevel OptLevel) {
6773 if (OptLevel == CodeGenOptLevel::None)
6774 return SDValue();
6775
6776 // Constant fold PRMT
6777 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6778 isa<ConstantSDNode>(N->getOperand(1)) &&
6779 isa<ConstantSDNode>(N->getOperand(2)))
6780 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6781 N->getConstantOperandAPInt(1),
6782 N->getConstantOperandAPInt(2),
6783 N->getConstantOperandVal(3)),
6784 SDLoc(N), N->getValueType(0));
6785 return SDValue();
6786}
6787
6788// During call lowering we wrap the return values in a ProxyReg node which
6789 // depends on the chain value produced by the completed call. This ensures that
6790// the full call is emitted in cases where libcalls are used to legalize
6791// operations. To improve the functioning of other DAG combines we pull all
6792// operations we can through one of these nodes, ensuring that the ProxyReg
6793// directly wraps a load. That is:
6794//
6795// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6796//
6797 static SDValue sinkProxyReg(SDValue R, SDValue Chain,
6798                             TargetLowering::DAGCombinerInfo &DCI) {
6799   switch (R.getOpcode()) {
6800 case ISD::TRUNCATE:
6801 case ISD::ANY_EXTEND:
6802 case ISD::SIGN_EXTEND:
6803 case ISD::ZERO_EXTEND:
6804 case ISD::BITCAST: {
6805 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6806 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6807 return SDValue();
6808 }
6809 case ISD::SHL:
6810 case ISD::SRL:
6811 case ISD::SRA:
6812 case ISD::OR: {
6813 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6814 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6815 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6816 return SDValue();
6817 }
6818 case ISD::Constant:
6819 return R;
6820 case ISD::LOAD:
6821 case NVPTXISD::LoadV2:
6822 case NVPTXISD::LoadV4: {
6823 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6824 {Chain, R});
6825 }
6826 case ISD::BUILD_VECTOR: {
6827 if (DCI.isBeforeLegalize())
6828 return SDValue();
6829
6830     SmallVector<SDValue, 16> Ops;
6831     for (auto &Op : R->ops()) {
6832 SDValue V = sinkProxyReg(Op, Chain, DCI);
6833 if (!V)
6834 return SDValue();
6835 Ops.push_back(V);
6836 }
6837 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6838 }
6839   case ISD::EXTRACT_VECTOR_ELT: {
6840     if (DCI.isBeforeLegalize())
6841 return SDValue();
6842
6843 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6844       return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
6845                              R.getValueType(), V, R.getOperand(1));
6846 return SDValue();
6847 }
6848 default:
6849 return SDValue();
6850 }
6851}
6852
6853static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) {
6854 switch (AddIntrinsicID) {
6855 default:
6856 break;
6857 case Intrinsic::nvvm_add_rn_sat_f16:
6858 case Intrinsic::nvvm_add_rn_sat_v2f16:
6859 return NVPTXISD::SUB_RN_SAT;
6860 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
6861 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
6862 return NVPTXISD::SUB_RN_FTZ_SAT;
6863 }
6864 llvm_unreachable("Invalid F16 add intrinsic");
6865}
6866
6867 static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG,
6868                                     Intrinsic::ID AddIntrinsicID) {
6869 SDValue Op1 = N->getOperand(1);
6870 SDValue Op2 = N->getOperand(2);
6871
6872 SDValue SubOp1, SubOp2;
6873
6874 if (Op1.getOpcode() == ISD::FNEG) {
6875 SubOp1 = Op2;
6876 SubOp2 = Op1.getOperand(0);
6877 } else if (Op2.getOpcode() == ISD::FNEG) {
6878 SubOp1 = Op1;
6879 SubOp2 = Op2.getOperand(0);
6880 } else {
6881 return SDValue();
6882 }
6883
6884 SDLoc DL(N);
6885 return DAG.getNode(getF16SubOpc(AddIntrinsicID), DL, N->getValueType(0),
6886 SubOp1, SubOp2);
6887}
6888
6889 static SDValue combineIntrinsicWOChain(SDNode *N,
6890                                        TargetLowering::DAGCombinerInfo &DCI,
6891                                        const NVPTXSubtarget &STI) {
6892 unsigned IID = N->getConstantOperandVal(0);
6893
6894 switch (IID) {
6895 default:
6896 break;
6897 case Intrinsic::nvvm_add_rn_sat_f16:
6898 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
6899 case Intrinsic::nvvm_add_rn_sat_v2f16:
6900 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
6901 return combineF16AddWithNeg(N, DCI.DAG, IID);
6902 }
6903 return SDValue();
6904}
6905
6906 static SDValue combineProxyReg(SDNode *N,
6907                                TargetLowering::DAGCombinerInfo &DCI) {
6908
6909 SDValue Chain = N->getOperand(0);
6910 SDValue Reg = N->getOperand(1);
6911
6912 // If the ProxyReg is not wrapping a load, try to pull the operations through
6913 // the ProxyReg.
6914 if (Reg.getOpcode() != ISD::LOAD) {
6915 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6916 return V;
6917 }
6918
6919 return SDValue();
6920}
6921
6922SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6923 DAGCombinerInfo &DCI) const {
6924   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6925   switch (N->getOpcode()) {
6926 default:
6927 break;
6928 case ISD::ADD:
6929 return PerformADDCombine(N, DCI, OptLevel);
6930 case ISD::ADDRSPACECAST:
6931 return combineADDRSPACECAST(N, DCI);
6932 case ISD::SIGN_EXTEND:
6933 case ISD::ZERO_EXTEND:
6934 return combineMulWide(N, DCI, OptLevel);
6935 case ISD::BUILD_VECTOR:
6936 return PerformBUILD_VECTORCombine(N, DCI);
6937   case ISD::EXTRACT_VECTOR_ELT:
6938     return PerformEXTRACTCombine(N, DCI);
6939 case ISD::FADD:
6940 return PerformFADDCombine(N, DCI, OptLevel);
6941 case ISD::FMAXNUM:
6942 case ISD::FMINNUM:
6943 case ISD::FMAXIMUM:
6944 case ISD::FMINIMUM:
6945 case ISD::FMAXIMUMNUM:
6946 case ISD::FMINIMUMNUM:
6947 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6948 STI.getSmVersion());
6949 case ISD::LOAD:
6950 case NVPTXISD::LoadV2:
6951 case NVPTXISD::LoadV4:
6952 return combineLOAD(N, DCI, STI);
6953 case ISD::MUL:
6954 return PerformMULCombine(N, DCI, OptLevel);
6955 case NVPTXISD::PRMT:
6956 return combinePRMT(N, DCI, OptLevel);
6957 case NVPTXISD::ProxyReg:
6958 return combineProxyReg(N, DCI);
6959 case ISD::SETCC:
6960 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6961 case ISD::SHL:
6962 return PerformSHLCombine(N, DCI, OptLevel);
6963 case ISD::SREM:
6964 case ISD::UREM:
6965 return PerformREMCombine(N, DCI, OptLevel);
6966 case ISD::STORE:
6967 case NVPTXISD::StoreV2:
6968 case NVPTXISD::StoreV4:
6969 return combineSTORE(N, DCI, STI);
6970 case ISD::SELECT:
6971 return PerformSELECTShiftCombine(N, DCI);
6972 case ISD::VSELECT:
6973 return PerformVSELECTCombine(N, DCI);
6974   case ISD::INTRINSIC_WO_CHAIN:
6975     return combineIntrinsicWOChain(N, DCI, STI);
6976 }
6977 return SDValue();
6978}
6979
6980 static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
6981                            SmallVectorImpl<SDValue> &Results) {
6982   // Handle bitcasting to v2i8 without hitting the default promotion
6983 // strategy which goes through stack memory.
6984 SDValue Op(Node, 0);
6985 EVT ToVT = Op->getValueType(0);
6986 if (ToVT != MVT::v2i8) {
6987 return;
6988 }
6989
6990 // Bitcast to i16 and unpack elements into a vector
6991 SDLoc DL(Node);
6992 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6993 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6994 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6995 SDValue Vec1 =
6996 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6997 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6998 Results.push_back(
6999 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
7000}
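//
// For example, a bitcast of the i16 value 0xAABB to v2i8 is expanded here to
// element 0 = trunc(0xAABB) = 0xBB and element 1 = trunc(0xAABB >> 8) = 0xAA,
// repacked with BUILD_VECTOR, instead of going through a stack temporary.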
7001
7002 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
7003                                      SmallVectorImpl<SDValue> &Results) {
7004   SDValue Chain = N->getOperand(0);
7005 SDValue Intrin = N->getOperand(1);
7006 SDLoc DL(N);
7007
7008 // Get the intrinsic ID
7009 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
7010 switch (IntrinNo) {
7011 default:
7012 return;
7013 case Intrinsic::nvvm_ldu_global_i:
7014 case Intrinsic::nvvm_ldu_global_f:
7015 case Intrinsic::nvvm_ldu_global_p: {
7016 EVT ResVT = N->getValueType(0);
7017
7018 if (ResVT.isVector()) {
7019 // Vector LDG/LDU
7020
7021 unsigned NumElts = ResVT.getVectorNumElements();
7022 EVT EltVT = ResVT.getVectorElementType();
7023
7024 // Since LDU/LDG are target nodes, we cannot rely on DAG type
7025 // legalization.
7026 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
7027 // loaded type to i16 and propagate the "real" type as the memory type.
7028 bool NeedTrunc = false;
7029 if (EltVT.getSizeInBits() < 16) {
7030 EltVT = MVT::i16;
7031 NeedTrunc = true;
7032 }
7033
7034 unsigned Opcode = 0;
7035 SDVTList LdResVTs;
7036
7037 switch (NumElts) {
7038 default:
7039 return;
7040 case 2:
7041 Opcode = NVPTXISD::LDUV2;
7042 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
7043 break;
7044 case 4: {
7045 Opcode = NVPTXISD::LDUV4;
7046 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
7047 LdResVTs = DAG.getVTList(ListVTs);
7048 break;
7049 }
7050 }
7051
7052 SmallVector<SDValue, 8> OtherOps;
7053
7054 // Copy regular operands
7055
7056 OtherOps.push_back(Chain); // Chain
7057 // Skip operand 1 (intrinsic ID)
7058 // Others
7059 OtherOps.append(N->op_begin() + 2, N->op_end());
7060
7061       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
7062
7063 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
7064 MemSD->getMemoryVT(),
7065 MemSD->getMemOperand());
7066
7067 SmallVector<SDValue, 4> ScalarRes;
7068
7069 for (unsigned i = 0; i < NumElts; ++i) {
7070 SDValue Res = NewLD.getValue(i);
7071 if (NeedTrunc)
7072 Res =
7073 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
7074 ScalarRes.push_back(Res);
7075 }
7076
7077 SDValue LoadChain = NewLD.getValue(NumElts);
7078
7079 SDValue BuildVec =
7080 DAG.getBuildVector(ResVT, DL, ScalarRes);
7081
7082 Results.push_back(BuildVec);
7083 Results.push_back(LoadChain);
7084 } else {
7085 // i8 LDG/LDU
7086 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
7087 "Custom handling of non-i8 ldu/ldg?");
7088
7089 // Just copy all operands as-is
7090       SmallVector<SDValue, 4> Ops(N->ops());
7091
7092 // Force output to i16
7093 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
7094
7095       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
7096
7097 // We make sure the memory type is i8, which will be used during isel
7098 // to select the proper instruction.
7099 SDValue NewLD =
7100           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
7101                                   MVT::i8, MemSD->getMemOperand());
7102
7103 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7104 NewLD.getValue(0)));
7105 Results.push_back(NewLD.getValue(1));
7106 }
7107 return;
7108 }
7109
7110 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
7111 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
7112 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
7113 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
7114 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
7115 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
7116 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
7117 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
7118 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
7119 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
7120 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
7121 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
7122 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
7123 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
7124 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
7125 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
7126 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
7127 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
7128 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
7129 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
7130 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
7131 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
7132 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
7133 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
7134 if (auto Res = lowerTcgen05Ld(N, DAG)) {
7135 Results.push_back(Res->first);
7136 Results.push_back(Res->second);
7137 }
7138 return;
7139
7140 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
7141 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
7142 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
7143 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
7144 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
7145 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
7146 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
7147 Results.push_back(Res->first);
7148 Results.push_back(Res->second);
7149 }
7150 return;
7151
7152 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
7153 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
7154 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
7155 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
7156 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
7157 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
7158 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
7159 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
7160 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
7161 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
7162 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
7163 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
7164 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32:
7165 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32:
7166 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32:
7167 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32:
7168 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32:
7169 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32:
7170 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32:
7171 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32:
7172 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32:
7173 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32:
7174 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32:
7175 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32:
7176 if (auto Res = lowerTcgen05LdRed(N, DAG)) {
7177 Results.push_back(std::get<0>(*Res));
7178 Results.push_back(std::get<1>(*Res));
7179 Results.push_back(std::get<2>(*Res));
7180 }
7181 return;
7182 }
7183}
7184
7185 static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
7186                                    SmallVectorImpl<SDValue> &Results) {
7187   // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
7188 // result so that it can pass the legalization
7189 SDLoc DL(N);
7190 SDValue Chain = N->getOperand(0);
7191 SDValue Reg = N->getOperand(1);
7192 SDValue Glue = N->getOperand(2);
7193
7194 assert(Reg.getValueType() == MVT::i128 &&
7195 "Custom lowering for CopyFromReg with 128-bit reg only");
7196 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
7197 N->getValueType(2)};
7198 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
7199
7200 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
7201 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
7202 {NewValue.getValue(0), NewValue.getValue(1)});
7203
7204 Results.push_back(Pair);
7205 Results.push_back(NewValue.getValue(2));
7206 Results.push_back(NewValue.getValue(3));
7207}
7208
7209 static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
7210                             const TargetLowering &TLI,
7211                             SmallVectorImpl<SDValue> &Results) {
7212   SDValue Chain = N->getOperand(0);
7213 SDValue Reg = N->getOperand(1);
7214
7215 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
7216
7217 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
7218 SDValue NewProxy =
7219 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
7220 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
7221
7222 Results.push_back(Res);
7223}
7224
7225 static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
7226                                  const NVPTXSubtarget &STI,
7227                                  SmallVectorImpl<SDValue> &Results) {
7228   assert(N->getValueType(0) == MVT::i128 &&
7229 "Custom lowering for atomic128 only supports i128");
7230
7232 SDLoc dl(N);
7233
7234 if (!STI.hasAtomSwap128()) {
7237 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
7238 "requires target sm_90.",
7239 dl.getDebugLoc()));
7240
7241 Results.push_back(DAG.getUNDEF(MVT::i128));
7242 Results.push_back(AN->getOperand(0)); // Chain
7243 return;
7244 }
7245
7247 Ops.push_back(AN->getOperand(0)); // Chain
7248 Ops.push_back(AN->getOperand(1)); // Ptr
7249 for (const auto &Op : AN->ops().drop_front(2)) {
7250 // Low part
7251 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7252 DAG.getIntPtrConstant(0, dl)));
7253 // High part
7254 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7255 DAG.getIntPtrConstant(1, dl)));
7256 }
7257 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
7260 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
7261 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
7262 AN->getMemOperand());
7263 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
7264 {Result.getValue(0), Result.getValue(1)}));
7265 Results.push_back(Result.getValue(2));
7266}
7267
7268void NVPTXTargetLowering::ReplaceNodeResults(
7269     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
7270   switch (N->getOpcode()) {
7271 default:
7272 report_fatal_error("Unhandled custom legalization");
7273 case ISD::BITCAST:
7274 ReplaceBITCAST(N, DAG, Results);
7275 return;
7276 case ISD::LOAD:
7277 case ISD::MLOAD:
7278 replaceLoadVector(N, DAG, Results, STI);
7279 return;
7280   case ISD::INTRINSIC_W_CHAIN:
7281     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
7282     return;
7283 case ISD::CopyFromReg:
7284     ReplaceCopyFromReg_128(N, DAG, Results);
7285     return;
7286 case NVPTXISD::ProxyReg:
7287 replaceProxyReg(N, DAG, *this, Results);
7288 return;
7289   case ISD::ATOMIC_CMP_SWAP:
7290   case ISD::ATOMIC_SWAP:
7291 replaceAtomicSwap128(N, DAG, STI, Results);
7292 return;
7293 }
7294}
7295
7296 NVPTXTargetLowering::AtomicExpansionKind
7297 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
7298   Type *Ty = AI->getValOperand()->getType();
7299
7300   if (AI->isFloatingPointOperation()) {
7301     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
7302       if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
7303           STI.getPTXVersion() >= 63)
7304         return AtomicExpansionKind::None;
7305       if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
7306           STI.getPTXVersion() >= 78)
7307         return AtomicExpansionKind::None;
7308       if (Ty->isFloatTy())
7309         return AtomicExpansionKind::None;
7310       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
7311         return AtomicExpansionKind::None;
7312     }
7313     return AtomicExpansionKind::CmpXChg;
7314   }
7315
7316 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
7317 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
7318
7319 switch (AI->getOperation()) {
7320 default:
7323 if (BitWidth == 128)
7325 [[fallthrough]];
7329 switch (BitWidth) {
7330 case 8:
7331 case 16:
7333 case 32:
7335 case 64:
7336 if (STI.hasAtomBitwise64())
7339 case 128:
7341 default:
7342 llvm_unreachable("unsupported width encountered");
7343 }
7350 switch (BitWidth) {
7351 case 8:
7352 case 16:
7354 case 32:
7356 case 64:
7357 if (STI.hasAtomMinMax64())
7360 case 128:
7362 default:
7363 llvm_unreachable("unsupported width encountered");
7364 }
7367 switch (BitWidth) {
7368 case 32:
7370 case 8:
7371 case 16:
7372 case 64:
7373 case 128:
7375 default:
7376 llvm_unreachable("unsupported width encountered");
7377 }
7378 }
7379
7381}
7382
7383 bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
7384     const Instruction *I) const {
7385 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
7386 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
7387 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
7388 // the memory order using explicit fences around the retry loop.
7389 // The memory order of natively supported CAS operations can be enforced
7390 // by lowering to an atom.cas with the right memory synchronizing effect.
7391 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
7392 // So we also use explicit fences for enforcing memory order for
7393 // seq_cst CAS with natively-supported bitwidths.
7394 return CI &&
7395 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
7396 STI.getMinCmpXchgSizeInBits() ||
7397 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
7398}
7399
7400 AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
7401     const Instruction *I) const {
7402 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
7403 bool BitwidthSupportedAndIsSeqCst =
7404 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
7405 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
7406 STI.getMinCmpXchgSizeInBits();
7407 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
7408                                         : AtomicOrdering::Monotonic;
7409 }
7410
7411 Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
7412                                                    Instruction *Inst,
7413 AtomicOrdering Ord) const {
7414 if (!isa<AtomicCmpXchgInst>(Inst))
7415 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
7416
7417 // Specialize for cmpxchg
7418 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
7419 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
7420 if (isReleaseOrStronger(Ord))
7421 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
7422 ? Ord
7423                                    : AtomicOrdering::Release,
7424                                SSID);
7425
7426 return nullptr;
7427}
7428
7429 Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
7430                                                     Instruction *Inst,
7431 AtomicOrdering Ord) const {
7432 // Specialize for cmpxchg
7433 if (!isa<AtomicCmpXchgInst>(Inst))
7434 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
7435
7436 auto *CI = cast<AtomicCmpXchgInst>(Inst);
7437 auto CASWidth =
7438 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
7439 SyncScope::ID SSID = CI->getSyncScopeID();
7440 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
7441 if (isAcquireOrStronger(Ord) &&
7442       !(Ord == AtomicOrdering::SequentiallyConsistent &&
7443         CASWidth < STI.getMinCmpXchgSizeInBits()))
7444 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
7445
7446 return nullptr;
7447}
7448
7449// Rather than default to SINT when both UINT and SINT are custom, we only
7450// change the opcode when UINT is not legal and SINT is. UINT is preferred when
7451// both are custom since unsigned CVT instructions can lead to slightly better
7452// SASS code with fewer instructions.
7454 EVT ToVT) const {
7455 if (isOperationLegal(Op, ToVT))
7456 return Op;
7457 switch (Op) {
7458 case ISD::FP_TO_UINT:
7459     if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
7460       return ISD::FP_TO_SINT;
7461 break;
7462   case ISD::STRICT_FP_TO_UINT:
7463     if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
7464       return ISD::STRICT_FP_TO_SINT;
7465     break;
7466 case ISD::VP_FP_TO_UINT:
7467 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
7468 return ISD::VP_FP_TO_SINT;
7469 break;
7470 default:
7471 break;
7472 }
7473 return Op;
7474}
7475
7476// Pin NVPTXTargetObjectFile's vtables to this file.
7477 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
7478
7479 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
7480     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
7481   return getDataSection();
7482 }
7483
7484 static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
7485                                     const SelectionDAG &DAG, unsigned Depth) {
7486 SDValue A = Op.getOperand(0);
7487 SDValue B = Op.getOperand(1);
7488 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
7489 unsigned Mode = Op.getConstantOperandVal(3);
7490
7491 if (!Selector)
7492 return;
7493
7494 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
7495 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
7496
7497 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
7498 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
7499 "PRMT must have i32 operands");
7500 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
7501 KnownBits BitField = BKnown.concat(AKnown);
7502
7503 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
7504 for (unsigned I : llvm::seq(4)) {
7505 APInt Sel = SelectorVal.extractBits(4, I * 4);
7506 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7507 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7508 KnownBits Byte = BitField.extractBits(8, Idx * 8);
7509 if (Sign)
7510 Byte = KnownBits::ashr(Byte, 8);
7511 Known.insertBits(Byte, I * 8);
7512 }
7513}
7514
7515static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
7516   const MemSDNode *LD = cast<MemSDNode>(Op);
7517
7518 // We can't do anything without knowing the sign bit.
7519 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
7520 if (ExtType == ISD::SEXTLOAD)
7521 return;
7522
7523 // ExtLoading to vector types is weird and may not work well with known bits.
7524 auto DestVT = LD->getValueType(0);
7525 if (DestVT.isVector())
7526 return;
7527
7528 assert(Known.getBitWidth() == DestVT.getSizeInBits());
7529 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
7530 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
7531}
7532
7533 void NVPTXTargetLowering::computeKnownBitsForTargetNode(
7534     const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
7535 const SelectionDAG &DAG, unsigned Depth) const {
7536 Known.resetAll();
7537
7538 switch (Op.getOpcode()) {
7539 case NVPTXISD::PRMT:
7540 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
7541 break;
7542 case NVPTXISD::LoadV2:
7543 case NVPTXISD::LoadV4:
7544 case NVPTXISD::LoadV8:
7545     computeKnownBitsForLoadV(Op, Known);
7546     break;
7547 default:
7548 break;
7549 }
7550}
7551
7552static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
7553 const APInt &DemandedBits) {
7554 APInt DemandedLHS = APInt(32, 0);
7555 APInt DemandedRHS = APInt(32, 0);
7556
7557 for (unsigned I : llvm::seq(4)) {
7558 if (DemandedBits.extractBits(8, I * 8).isZero())
7559 continue;
7560
7561 APInt Sel = SelectorVal.extractBits(4, I * 4);
7562 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7563 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7564
7565 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
7566 unsigned ByteStart = (Idx % 4) * 8;
7567 if (Sign)
7568 Src.setBit(ByteStart + 7);
7569 else
7570 Src.setBits(ByteStart, ByteStart + 8);
7571 }
7572
7573 return {DemandedLHS, DemandedRHS};
7574}
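//
// As an example, if only the low result byte is demanded and the corresponding
// selector nibble is 5, then byte 1 of the second operand is what matters:
// bits [8, 16) of DemandedRHS are set (or only its sign bit, bit 15, when the
// nibble requests sign replication).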
7575
7576// Replace undef with 0 as this is easier for other optimizations such as
7577// known bits.
7578 static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
7579   if (!Op)
7580 return SDValue();
7581 if (Op.isUndef())
7582 return DAG.getConstant(0, SDLoc(), MVT::i32);
7583 return Op;
7584}
7585
7586 static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
7587                                            const APInt &DemandedBits,
7588 SelectionDAG &DAG,
7589 const TargetLowering &TLI,
7590 unsigned Depth) {
7591 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
7592 SDValue Op0 = PRMT.getOperand(0);
7593 SDValue Op1 = PRMT.getOperand(1);
7594 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
7595 if (!SelectorConst)
7596 return SDValue();
7597
7598 unsigned Mode = PRMT.getConstantOperandVal(3);
7599 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
7600
7601 // Try to simplify the PRMT to one of the inputs if the used bytes are all
7602 // from the same input in the correct order.
7603 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
7604 const unsigned SelBits = (4 - LeadingBytes) * 4;
7605 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
7606 return Op0;
7607 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
7608 return Op1;
7609
7610 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
7611
7612 // Attempt to avoid multi-use ops if we don't need anything from them.
7613 SDValue DemandedOp0 =
7614 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
7615 SDValue DemandedOp1 =
7616 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
7617
7618 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
7619 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
7620 if ((DemandedOp0 && DemandedOp0 != Op0) ||
7621 (DemandedOp1 && DemandedOp1 != Op1)) {
7622 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
7623 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
7624 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
7625 }
7626
7627 return SDValue();
7628}
7629
7631 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
7632 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
7633 Known.resetAll();
7634
7635 switch (Op.getOpcode()) {
7636 case NVPTXISD::PRMT:
7637     if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
7638                                                      *this, Depth)) {
7639 TLO.CombineTo(Op, Result);
7640 return true;
7641 }
7642 break;
7643 default:
7644 break;
7645 }
7646
7647 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7648 return false;
7649}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG, unsigned Val)
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue PerformSELECTShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Transform patterns like: (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt)) (select (ult...
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static unsigned getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static unsigned getTcgen05LdRedID(Intrinsic::ID IID)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static std::optional< unsigned > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isConstZero(const SDValue &Operand)
static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE)
static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG)
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static std::pair< MemSDNode *, uint32_t > convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static std::optional< std::tuple< SDValue, SDValue, SDValue > > lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG)
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG)
static SDValue combineIntrinsicWOChain(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG, Intrinsic::ID AddIntrinsicID)
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1400
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1339
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:436
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1131
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:433
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
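As a small, hedged usage example of the APInt helpers listed above (extractBits, getZExtValue, and friends), the snippet below splits a 32-bit value into bytes and reassembles it; it is illustrative only and not code from this file.

#include "llvm/ADT/APInt.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

// Round-trip a 32-bit value through per-byte extractBits/insertBits calls.
static uint32_t roundTripBytes(uint32_t In) {
  APInt V(/*numBits=*/32, In);
  APInt Rebuilt(/*numBits=*/32, 0);
  for (unsigned I = 0; I < 4; ++I) {
    APInt Byte = V.extractBits(/*numBits=*/8, /*bitPosition=*/I * 8);
    Rebuilt.insertBits(Byte, /*bitPosition=*/I * 8);
  }
  assert(Rebuilt == V && "byte-wise round trip should be lossless");
  return static_cast<uint32_t>(Rebuilt.getZExtValue());
}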
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
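A hedged sketch (not NVPTX's actual policy) of how the AtomicRMWInst queries above typically feed a shouldExpandAtomicRMWInIR-style decision:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative policy only: expand FP and uncommon RMW operations through a
// cmpxchg loop, keep the plain integer bitwise/arithmetic forms native.
static TargetLowering::AtomicExpansionKind
classifyAtomicRMWSketch(const AtomicRMWInst *AI) {
  if (AI->isFloatingPointOperation())
    return TargetLowering::AtomicExpansionKind::CmpXChg;

  switch (AI->getOperation()) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return TargetLowering::AtomicExpansionKind::None;
  default:
    return TargetLowering::AtomicExpansionKind::CmpXChg;
  }
}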
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:639
Module * getParent()
Get the module that this global value is contained inside of...
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
static constexpr unsigned NoRegister
Definition MCRegister.h:60
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:517
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const
bool hasUsedBytesMaskPragma() const
bool hasTensormapReplaceElemtypeSupport(unsigned value) const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
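A hedged, self-contained example of the SDNode/SDValue traversal APIs above (users(), getOpcode(), getOperand()); it is illustrative only:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Return true if every user of V is an ISD::STORE that stores V itself
// (the stored value is operand 1 of a store node).
static bool onlyUsedAsStoredValue(SDValue V) {
  for (const SDNode *User : V->users()) {
    if (User->getOpcode() != ISD::STORE || User->getOperand(1) != V)
      return false;
  }
  return true;
}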
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
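A hedged example of the SelectionDAG node-construction helpers above (getConstant, getSetCC, getSelect); the folded pattern is arbitrary and purely illustrative:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Build select(setne(X, 0), X, 1) in whatever integer type X already has.
static SDValue buildSelectNonZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETNE);
  return DAG.getSelect(DL, VT, Cond, X, One);
}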
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
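The TargetLoweringBase hooks above are normally invoked from a target's TargetLowering constructor. A hedged sketch of such a constructor follows; the class, its parameters, and the particular actions chosen are hypothetical and are not NVPTX's configuration.

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical target lowering setup, shown only to illustrate the protected
// configuration hooks; none of these choices are taken from this file.
class ExampleTargetLowering : public TargetLowering {
public:
  ExampleTargetLowering(const TargetMachine &TM,
                        const TargetRegisterClass *GPR32RC,
                        const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    // Make i32 a legal type backed by the given register class.
    addRegisterClass(MVT::i32, GPR32RC);

    // Expand signed div/rem pairs and sign-extending i8 loads into i32.
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);

    // Truncating f32 -> f16 stores are not supported directly.
    setTruncStoreAction(MVT::f32, MVT::f16, Expand);

    // Derive register/type properties once all classes are registered.
    computeRegisterProperties(TRI);
  }
};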
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3166
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ POISON
POISON - A poison node.
Definition ISDOpcodes.h:236
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:386
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:241
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ VECREDUCE_FMINIMUM
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
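As a hedged illustration of the ISD opcodes above together with the node builders, here is a standalone expansion of ISD::ABS into a compare-and-select; it is not taken from this file:

#include "llvm/CodeGen/SelectionDAG.h"
#include <cassert>
using namespace llvm;

// Expand abs(x) as select(x < 0, 0 - x, x) using generic ISD nodes.
static SDValue expandABSSketch(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::ABS && "expected ISD::ABS");
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue X = Op.getOperand(0);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
  return DAG.getSelectCC(DL, X, Zero, Neg, X, ISD::SETLT);
}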
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:257
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os, -Oz
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
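A hedged usage example of the ComputeValueVTs helper listed above, flattening an IR type into the EVTs and offsets that SelectionDAG lowering works with; the wrapper name is hypothetical:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Collect the EVTs (and their byte offsets) used to carry RetTy's value.
static void collectReturnVTs(const TargetLowering &TLI, const DataLayout &DL,
                             Type *RetTy, SmallVectorImpl<EVT> &VTs,
                             SmallVectorImpl<TypeSize> &Offsets) {
  if (RetTy->isVoidTy())
    return; // nothing is returned, so no EVTs are produced
  ComputeValueVTs(TLI, DL, RetTy, VTs, /*MemVTs=*/nullptr, &Offsets);
}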
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
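A hedged example of the EVT queries above, deciding whether a vector type fits a single 32-bit packed register; the predicate and its threshold are illustrative assumptions, not this file's rules:

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// True for fixed-width, simple vector types that occupy exactly 32 bits and
// whose elements are at most 16 bits wide.
static bool isSmallPackedVectorSketch(EVT VT) {
  if (!VT.isVector() || VT.isScalableVector() || !VT.isSimple())
    return false;
  return VT.getFixedSizeInBits() == 32 &&
         VT.getVectorElementType().getScalarSizeInBits() <= 16;
}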
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:246
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:232
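A hedged sketch of how the KnownBits helpers above (resetAll, insertBits, getBitWidth) are typically combined when computing known bits of a value assembled from two halves; it is illustrative only:

#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

// Fill Known for a value whose low and high halves have independently known
// bits, as in a BUILD_PAIR-style computation.
static void computeKnownBitsOfHalves(const KnownBits &LoHalf,
                                     const KnownBits &HiHalf,
                                     KnownBits &Known) {
  assert(Known.getBitWidth() == LoHalf.getBitWidth() + HiHalf.getBitWidth() &&
         "result width must cover both halves");
  Known.resetAll();
  Known.insertBits(LoHalf, 0);
  Known.insertBits(HiHalf, LoHalf.getBitWidth());
}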
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...