1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
36#include "llvm/IR/Argument.h"
37#include "llvm/IR/Attributes.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DataLayout.h"
42#include "llvm/IR/FPEnv.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/GlobalValue.h"
45#include "llvm/IR/Instruction.h"
47#include "llvm/IR/IntrinsicsNVPTX.h"
48#include "llvm/IR/Module.h"
49#include "llvm/IR/Type.h"
50#include "llvm/IR/Value.h"
60#include <algorithm>
61#include <cassert>
62#include <cmath>
63#include <cstdint>
64#include <iterator>
65#include <optional>
66#include <string>
67#include <utility>
68#include <vector>
69
70#define DEBUG_TYPE "nvptx-lower"
71
72using namespace llvm;
73
74static std::atomic<unsigned> GlobalUniqueCallSite;
75
76static cl::opt<bool> sched4reg(
77 "nvptx-sched4reg",
78 cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
79
81 "nvptx-fma-level", cl::Hidden,
82 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
83 " 1: do it 2: do it aggressively"),
84 cl::init(2));
85
86static cl::opt<int> UsePrecDivF32(
87 "nvptx-prec-divf32", cl::Hidden,
88 cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
89 " IEEE Compliant F32 div.rnd if available."),
90 cl::init(2));
91
92static cl::opt<bool> UsePrecSqrtF32(
93 "nvptx-prec-sqrtf32", cl::Hidden,
94 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
95 cl::init(true));
96
98 "nvptx-force-min-byval-param-align", cl::Hidden,
99 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
100 " params of device functions."),
101 cl::init(false));
102
103int NVPTXTargetLowering::getDivF32Level() const {
104 if (UsePrecDivF32.getNumOccurrences() > 0) {
105 // If nvptx-prec-div32=N is used on the command-line, always honor it
106 return UsePrecDivF32;
107 } else {
108 // Otherwise, use div.approx if fast math is enabled
109 if (getTargetMachine().Options.UnsafeFPMath)
110 return 0;
111 else
112 return 2;
113 }
114}
115
116bool NVPTXTargetLowering::usePrecSqrtF32() const {
117 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
118 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
119 return UsePrecSqrtF32;
120 } else {
121 // Otherwise, use sqrt.approx if fast math is enabled
122 return !getTargetMachine().Options.UnsafeFPMath;
123 }
124}
125
129}
130
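// Returns true if VT is one of the vector types that PTX can load/store with a
// single v2/v4 vector access (with 8-bit and 16-bit elements packed into
// 32-bit words where needed).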
131static bool IsPTXVectorType(MVT VT) {
132 switch (VT.SimpleTy) {
133 default:
134 return false;
135 case MVT::v2i1:
136 case MVT::v4i1:
137 case MVT::v2i8:
138 case MVT::v4i8:
139 case MVT::v8i8: // <2 x i8x4>
140 case MVT::v16i8: // <4 x i8x4>
141 case MVT::v2i16:
142 case MVT::v4i16:
143 case MVT::v8i16: // <4 x i16x2>
144 case MVT::v2i32:
145 case MVT::v4i32:
146 case MVT::v2i64:
147 case MVT::v2f16:
148 case MVT::v4f16:
149 case MVT::v8f16: // <4 x f16x2>
150 case MVT::v2bf16:
151 case MVT::v4bf16:
152 case MVT::v8bf16: // <4 x bf16x2>
153 case MVT::v2f32:
154 case MVT::v4f32:
155 case MVT::v2f64:
156 return true;
157 }
158}
159
160static bool Is16bitsType(MVT VT) {
161 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
162 VT.SimpleTy == MVT::i16);
163}
164
165// This function is called when legalizing vector loads/stores. It does two
166// things:
167// 1. Determines whether the vector is something we want to custom lower;
168// std::nullopt is returned if we do not want to custom lower it.
169// 2. If we do want to handle it, returns two parameters:
170// - unsigned int NumElts - The number of elements in the final vector
171// - EVT EltVT - The type of the elements in the final vector
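// For example, v8f16 is reported as 4 elements of type v2f16 and v16i8 as
// 4 elements of type v4i8, so that each element fits into one 32-bit register.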
172static std::optional<std::pair<unsigned int, EVT>>
173getVectorLoweringShape(EVT VectorVT) {
174 if (!VectorVT.isVector() || !VectorVT.isSimple())
175 return std::nullopt;
176
177 EVT EltVT = VectorVT.getVectorElementType();
178 unsigned NumElts = VectorVT.getVectorNumElements();
179
180 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
181 // legal. We can (and should) split that into 2 stores of <2 x double> here
182 // but I'm leaving that as a TODO for now.
183 switch (VectorVT.getSimpleVT().SimpleTy) {
184 default:
185 return std::nullopt;
186 case MVT::v2i8:
187 case MVT::v2i16:
188 case MVT::v2i32:
189 case MVT::v2i64:
190 case MVT::v2f16:
191 case MVT::v2bf16:
192 case MVT::v2f32:
193 case MVT::v2f64:
194 case MVT::v4i8:
195 case MVT::v4i16:
196 case MVT::v4i32:
197 case MVT::v4f16:
198 case MVT::v4bf16:
199 case MVT::v4f32:
200 // This is a "native" vector type
201 return std::pair(NumElts, EltVT);
202 case MVT::v8i8: // <2 x i8x4>
203 case MVT::v8f16: // <4 x f16x2>
204 case MVT::v8bf16: // <4 x bf16x2>
205 case MVT::v8i16: // <4 x i16x2>
206 case MVT::v16i8: // <4 x i8x4>
207 // This can be upsized into a "native" vector type.
208 // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
209 // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
210 // vectorized loads/stores with the actual element type for i8/i16 as that
211 // would require v8/v16 variants that do not exist.
212 // In order to load/store such vectors efficiently, here in Type
213 // Legalization, we split the vector into word-sized chunks (v2x16/v4i8).
214 // Later, we will lower to PTX as vectors of b32.
215
216 // Number of elements to pack in one word.
217 unsigned NPerWord = 32 / EltVT.getSizeInBits();
218
219 return std::pair(NumElts / NPerWord,
220 MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord));
221 }
222
223 llvm_unreachable("All cases in switch should return.");
224}
225
226/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
227/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
228/// into their primitive components.
229/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
230/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
231/// LowerCall, and LowerReturn.
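/// For example, [4 x float] is flattened to four f32 values at offsets
/// 0/4/8/12, and <4 x half> to two v2f16 values at offsets 0 and 4.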
232static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
233 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
234 SmallVectorImpl<uint64_t> *Offsets = nullptr,
235 uint64_t StartingOffset = 0) {
236 SmallVector<EVT, 16> TempVTs;
237 SmallVector<uint64_t, 16> TempOffsets;
238
239 // Special case for i128 - decompose to (i64, i64)
240 if (Ty->isIntegerTy(128)) {
241 ValueVTs.push_back(EVT(MVT::i64));
242 ValueVTs.push_back(EVT(MVT::i64));
243
244 if (Offsets) {
245 Offsets->push_back(StartingOffset + 0);
246 Offsets->push_back(StartingOffset + 8);
247 }
248
249 return;
250 }
251
252 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
253 if (StructType *STy = dyn_cast<StructType>(Ty)) {
254 auto const *SL = DL.getStructLayout(STy);
255 auto ElementNum = 0;
256 for(auto *EI : STy->elements()) {
257 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
258 StartingOffset + SL->getElementOffset(ElementNum));
259 ++ElementNum;
260 }
261 return;
262 }
263
264 // Given an array type, recursively traverse the elements with custom ComputePTXValueVTs.
265 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
266 Type *EltTy = ATy->getElementType();
267 uint64_t EltSize = DL.getTypeAllocSize(EltTy);
268 for (int I : llvm::seq<int>(ATy->getNumElements()))
269 ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets, StartingOffset + I * EltSize);
270 return;
271 }
272
273 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
274 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
275 EVT VT = TempVTs[i];
276 uint64_t Off = TempOffsets[i];
277 // Split vectors into individual elements, except for v2f16, which
278 // we will pass as a single scalar.
279 if (VT.isVector()) {
280 unsigned NumElts = VT.getVectorNumElements();
281 EVT EltVT = VT.getVectorElementType();
282 // We require power-of-2 sized vectors because
283 // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
284 // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
285 // vectors.
286 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 &&
287 isPowerOf2_32(NumElts)) {
288 // Vectors with an even number of f16 elements will be passed to
289 // us as an array of v2f16/v2bf16 elements. We must match this so we
290 // stay in sync with Ins/Outs.
291 switch (EltVT.getSimpleVT().SimpleTy) {
292 case MVT::f16:
293 EltVT = MVT::v2f16;
294 break;
295 case MVT::bf16:
296 EltVT = MVT::v2bf16;
297 break;
298 case MVT::i16:
299 EltVT = MVT::v2i16;
300 break;
301 default:
302 llvm_unreachable("Unexpected type");
303 }
304 NumElts /= 2;
305 } else if (EltVT.getSimpleVT() == MVT::i8 &&
306 ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) ||
307 NumElts == 3)) {
308 // v*i8 are formally lowered as v4i8
309 EltVT = MVT::v4i8;
310 NumElts = (NumElts + 3) / 4;
311 } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
312 // v2i8 is promoted to v2i16
313 NumElts = 1;
314 EltVT = MVT::v2i16;
315 }
316 for (unsigned j = 0; j != NumElts; ++j) {
317 ValueVTs.push_back(EltVT);
318 if (Offsets)
319 Offsets->push_back(Off + j * EltVT.getStoreSize());
320 }
321 } else {
322 ValueVTs.push_back(VT);
323 if (Offsets)
324 Offsets->push_back(Off);
325 }
326 }
327}
328
329/// PromoteScalarIntegerPTX
330/// Used to make sure the arguments/returns are suitable for passing
331/// and promote them to a larger size if they're not.
332///
333/// The promoted type is placed in \p PromotedVT if the function returns true.
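/// For example, an i3 value is promoted to i8 and an i24 value to i32, while an
/// i32 value is left unchanged and the function returns false.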
334static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
335 if (VT.isScalarInteger()) {
336 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
337 default:
339 "Promotion is not suitable for scalars of size larger than 64-bits");
340 case 1:
341 *PromotedVT = MVT::i1;
342 break;
343 case 2:
344 case 4:
345 case 8:
346 *PromotedVT = MVT::i8;
347 break;
348 case 16:
349 *PromotedVT = MVT::i16;
350 break;
351 case 32:
352 *PromotedVT = MVT::i32;
353 break;
354 case 64:
355 *PromotedVT = MVT::i64;
356 break;
357 }
358 return EVT(*PromotedVT) != VT;
359 }
360 return false;
361}
362
363// Check whether we can merge loads/stores of some of the pieces of a
364// flattened function parameter or return value into a single vector
365// load/store.
366//
367// The flattened parameter is represented as a list of EVTs and
368// offsets, and the whole structure is aligned to ParamAlignment. This
369// function determines whether we can load/store pieces of the
370// parameter starting at index Idx using a single vectorized op of
371// size AccessSize. If so, it returns the number of param pieces
372// covered by the vector op. Otherwise, it returns 1.
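// For example, four contiguous f32 pieces at offsets 0/4/8/12 in a parameter
// aligned to 16 bytes can be covered by a single 16-byte access, so the
// function returns 4.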
373static unsigned CanMergeParamLoadStoresStartingAt(
374 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
375 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
376
377 // Can't vectorize if param alignment is not sufficient.
378 if (ParamAlignment < AccessSize)
379 return 1;
380 // Can't vectorize if offset is not aligned.
381 if (Offsets[Idx] & (AccessSize - 1))
382 return 1;
383
384 EVT EltVT = ValueVTs[Idx];
385 unsigned EltSize = EltVT.getStoreSize();
386
387 // Element is too large to vectorize.
388 if (EltSize >= AccessSize)
389 return 1;
390
391 unsigned NumElts = AccessSize / EltSize;
392 // Can't vectorize if AccessSize is not a multiple of EltSize.
393 if (AccessSize != EltSize * NumElts)
394 return 1;
395
396 // We don't have enough elements to vectorize.
397 if (Idx + NumElts > ValueVTs.size())
398 return 1;
399
400 // PTX ISA can only deal with 2- and 4-element vector ops.
401 if (NumElts != 4 && NumElts != 2)
402 return 1;
403
404 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
405 // Types do not match.
406 if (ValueVTs[j] != EltVT)
407 return 1;
408
409 // Elements are not contiguous.
410 if (Offsets[j] - Offsets[j - 1] != EltSize)
411 return 1;
412 }
413 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
414 return NumElts;
415}
416
417// Flags for tracking per-element vectorization state of loads/stores
418// of a flattened function parameter or return value.
419enum ParamVectorizationFlags {
420 PVF_INNER = 0x0, // Middle elements of a vector.
421 PVF_FIRST = 0x1, // First element of the vector.
422 PVF_LAST = 0x2, // Last element of the vector.
423 // Scalar is effectively a 1-element vector.
424 PVF_SCALAR = PVF_FIRST | PVF_LAST
425};
426
427// Computes whether and how we can vectorize the loads/stores of a
428// flattened function parameter or return value.
429//
430// The flattened parameter is represented as the list of ValueVTs and
431// Offsets, and is aligned to ParamAlignment bytes. We return a vector
432// of the same size as ValueVTs indicating how each piece should be
433// loaded/stored (i.e. as a scalar, or as part of a vector
434// load/store).
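// For example, four contiguous f32 pieces that qualify for a single 16-byte
// access are marked {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}; pieces that
// cannot be merged stay PVF_SCALAR.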
437 const SmallVectorImpl<uint64_t> &Offsets,
438 Align ParamAlignment, bool IsVAArg = false) {
439 // Set vector size to match ValueVTs and mark all elements as
440 // scalars by default.
441 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
442 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
443
444 if (IsVAArg)
445 return VectorInfo;
446
447 // Check what we can vectorize using 128/64/32-bit accesses.
448 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
449 // Skip elements we've already processed.
450 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
451 for (unsigned AccessSize : {16, 8, 4, 2}) {
452 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
453 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
454 // Mark vectorized elements.
455 switch (NumElts) {
456 default:
457 llvm_unreachable("Unexpected return value");
458 case 1:
459 // Can't vectorize using this size, try next smaller size.
460 continue;
461 case 2:
462 assert(I + 1 < E && "Not enough elements.");
463 VectorInfo[I] = PVF_FIRST;
464 VectorInfo[I + 1] = PVF_LAST;
465 I += 1;
466 break;
467 case 4:
468 assert(I + 3 < E && "Not enough elements.");
469 VectorInfo[I] = PVF_FIRST;
470 VectorInfo[I + 1] = PVF_INNER;
471 VectorInfo[I + 2] = PVF_INNER;
472 VectorInfo[I + 3] = PVF_LAST;
473 I += 3;
474 break;
475 }
476 // Break out of the inner loop because we've already succeeded
477 // using largest possible AccessSize.
478 break;
479 }
480 }
481 return VectorInfo;
482}
483
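// Returns Value bitcast to VT, or Value itself if it already has that type.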
485 SDValue Value) {
486 if (Value->getValueType(0) == VT)
487 return Value;
488 return DAG.getNode(ISD::BITCAST, DL, VT, Value);
489}
490
491// NVPTXTargetLowering Constructor.
493 const NVPTXSubtarget &STI)
494 : TargetLowering(TM), nvTM(&TM), STI(STI) {
495 // Always lower memset, memcpy, and memmove intrinsics to load/store
496 // instructions, rather than generating calls to memset, memcpy, or
497 // memmove.
501
504
505 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
506 // condition branches.
507 setJumpIsExpensive(true);
508
509 // Wide divides are _very_ slow. Try to reduce the width of the divide if
510 // possible.
511 addBypassSlowDiv(64, 32);
512
513 // By default, use the Source scheduling
514 if (sched4reg)
516 else
518
519 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
520 LegalizeAction NoF16Action) {
521 bool IsOpSupported = STI.allowFP16Math();
522 switch (Op) {
523 // Several FP16 instructions are available on sm_80 only.
524 case ISD::FMINNUM:
525 case ISD::FMAXNUM:
528 case ISD::FMAXIMUM:
529 case ISD::FMINIMUM:
530 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
531 break;
532 }
533 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
534 };
535
536 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
537 LegalizeAction NoBF16Action) {
538 bool IsOpSupported = STI.hasBF16Math();
539 switch (Op) {
540 // Several BF16 instructions are available on sm_90 only.
541 case ISD::FADD:
542 case ISD::FMUL:
543 case ISD::FSUB:
544 case ISD::SELECT:
545 case ISD::SELECT_CC:
546 case ISD::SETCC:
547 case ISD::FEXP2:
548 case ISD::FCEIL:
549 case ISD::FFLOOR:
550 case ISD::FNEARBYINT:
551 case ISD::FRINT:
552 case ISD::FROUNDEVEN:
553 case ISD::FTRUNC:
554 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
555 break;
556 // Several BF16 instructions are available on sm_80 only.
557 case ISD::FMINNUM:
558 case ISD::FMAXNUM:
561 case ISD::FMAXIMUM:
562 case ISD::FMINIMUM:
563 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
564 break;
565 }
567 Op, VT, IsOpSupported ? Action : NoBF16Action);
568 };
569
570 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
571 LegalizeAction NoI16x2Action) {
572 bool IsOpSupported = false;
573 // instructions are available on sm_90 only
574 switch (Op) {
575 case ISD::ADD:
576 case ISD::SMAX:
577 case ISD::SMIN:
578 case ISD::UMIN:
579 case ISD::UMAX:
580 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
581 break;
582 }
583 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
584 };
585
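// Note that packed v2f16/v2bf16/v2i16/v4i8 values live in a single 32-bit
// register, and scalar f16/bf16 values share the 16-bit integer register
// class, since PTX has no dedicated 16-bit floating-point registers.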
586 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
587 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
588 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
589 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
590 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
591 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
592 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
593 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
594 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
595 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
596 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
597 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
598
599 // Conversion to/from FP16/FP16x2 is always legal.
604
606 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
608
609 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
610 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
611
612 // Conversion to/from BF16/BF16x2 is always legal.
617
618 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
619 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
620 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
621 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
622
623 // Conversion to/from i16/i16x2 is always legal.
628
633
634 // Custom conversions to/from v2i8.
636
637 // Only logical ops can be done on v4i8 directly, others must be done
638 // elementwise.
655 MVT::v4i8, Expand);
656
657 // Operations not directly supported by NVPTX.
658 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
659 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
660 MVT::i32, MVT::i64}) {
663 }
664
665 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
666 // For others we will expand to a SHL/SRA pair.
673
680
683
685 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
686 Expand);
687
688 if (STI.hasHWROT32())
690
692
695
698
699 // We want to legalize constant-related memmove and memcpy
700 // intrinsics.
702
703 // Turn FP extload into load/fpextend
704 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
705 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
706 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
707 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
708 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
709 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
710 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
711 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
712 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
713 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
714 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
715 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
716 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
717 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
718 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
719 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
720 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
721 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
722 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
723 // Turn FP truncstore into trunc + store.
724 // FIXME: vector types should also be expanded
725 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
726 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
727 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
728 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
729 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
730
731 // PTX does not support load / store predicate registers
734
735 for (MVT VT : MVT::integer_valuetypes()) {
739 setTruncStoreAction(VT, MVT::i1, Expand);
740 }
741
745 MVT::i1, Expand);
746
747 // expand extload of vector of integers.
749 MVT::v2i8, Expand);
750 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
751
752 // This is legal in NVPTX
757
758 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
760
761 // TRAP can be lowered to PTX trap
762 setOperationAction(ISD::TRAP, MVT::Other, Legal);
763 // DEBUGTRAP can be lowered to PTX brkpt
765
766 // Register custom handling for vector loads/stores
768 if (IsPTXVectorType(VT)) {
772 }
773 }
774
775 // Support varargs.
780
781 // Custom handling for i8 intrinsics
783
784 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
790
793 }
794
795 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
796 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
797 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
798 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
799 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
800 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
801 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
802
803 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
804 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
805 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
806 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
807 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
808 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
809
810 // Other arithmetic and logic ops are unsupported.
814 MVT::v2i16, Expand);
815
820 if (STI.getPTXVersion() >= 43) {
825 }
826
828 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
831
832 // PTX does not directly support SELP of i1, so promote to i32 first
834
835 // PTX cannot multiply two i64s in a single instruction.
838
839 // We have some custom DAG combine patterns for these nodes
843
844 // setcc for f16x2 and bf16x2 needs special handling to prevent
845 // legalizer's attempt to scalarize it due to v2i1 not being legal.
846 if (STI.allowFP16Math() || STI.hasBF16Math())
848
849 // Promote fp16 arithmetic if fp16 hardware isn't available or the
850 // user passed --nvptx-no-fp16-math. The flag is useful because,
851 // although sm_53+ GPUs have some sort of FP16 support in
852 // hardware, only sm_53 and sm_60 have full implementation. Others
853 // only have a token amount of hardware and are likely to run faster
854 // by using fp32 units instead.
855 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
856 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
857 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
858 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
859 // bf16 must be promoted to f32.
860 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
861 if (getOperationAction(Op, MVT::bf16) == Promote)
862 AddPromotedToType(Op, MVT::bf16, MVT::f32);
863 }
864
865 // f16/f16x2 neg was introduced in PTX 60, SM_53.
866 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
867 STI.getPTXVersion() >= 60 &&
868 STI.allowFP16Math();
869 for (const auto &VT : {MVT::f16, MVT::v2f16})
871 IsFP16FP16x2NegAvailable ? Legal : Expand);
872
873 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
874 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
875 // (would be) Library functions.
876
877 // These map to conversion instructions for scalar FP types.
878 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
880 setOperationAction(Op, MVT::f16, Legal);
881 setOperationAction(Op, MVT::f32, Legal);
882 setOperationAction(Op, MVT::f64, Legal);
883 setOperationAction(Op, MVT::v2f16, Expand);
884 setOperationAction(Op, MVT::v2bf16, Expand);
885 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
886 if (getOperationAction(Op, MVT::bf16) == Promote)
887 AddPromotedToType(Op, MVT::bf16, MVT::f32);
888 }
889
890 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
892 }
893 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
894 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
897 }
898 }
899
900 // sm_80 only has conversions between f32 and bf16. Custom lower all other
901 // bf16 conversions.
902 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
903 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
906 VT, Custom);
907 }
910 MVT::bf16, Custom);
911 }
912
919 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
920
921 // 'Expand' implements FCOPYSIGN without calling an external library.
928
929 // These map to corresponding instructions for f32/f64. f16 must be
930 // promoted to f32. v2f16 is expanded to f16, which is then promoted
931 // to f32.
932 for (const auto &Op :
934 setOperationAction(Op, MVT::f16, Promote);
935 setOperationAction(Op, MVT::f32, Legal);
936 setOperationAction(Op, MVT::f64, Legal);
937 setOperationAction(Op, MVT::v2f16, Expand);
938 setOperationAction(Op, MVT::v2bf16, Expand);
939 setOperationAction(Op, MVT::bf16, Promote);
940 AddPromotedToType(Op, MVT::bf16, MVT::f32);
941 }
942
943 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
944 if (STI.getPTXVersion() >= 65) {
945 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
946 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
947 } else {
949 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
950 }
951 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
952 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
953 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
954 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
955
956 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
957 setOperationAction(Op, MVT::f32, Legal);
958 setOperationAction(Op, MVT::f64, Legal);
959 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
960 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
961 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
962 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
963 if (getOperationAction(Op, MVT::bf16) == Promote)
964 AddPromotedToType(Op, MVT::bf16, MVT::f32);
965 }
966 bool SupportsF32MinMaxNaN =
967 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
968 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
969 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
970 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
971 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
972 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
973 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
974 }
975
976 // Custom lowering for inline asm with 128-bit operands
979
980 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
981 // No FPOW or FREM in PTX.
982
983 // Now deduce the information based on the above mentioned
984 // actions
986
987 setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
990}
991
992const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
993
994#define MAKE_CASE(V) \
995 case V: \
996 return #V;
997
998 switch ((NVPTXISD::NodeType)Opcode) {
1000 break;
1001
1064 }
1065 return nullptr;
1066
1067#undef MAKE_CASE
1068}
1069
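// Vectors of i1 (other than single-element ones) are split so that each lane
// is legalized as an individual predicate value.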
1072 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1073 VT.getScalarType() == MVT::i1)
1074 return TypeSplitVector;
1076}
1077
1079 int Enabled, int &ExtraSteps,
1080 bool &UseOneConst,
1081 bool Reciprocal) const {
1084 return SDValue();
1085
1086 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1087 ExtraSteps = 0;
1088
1089 SDLoc DL(Operand);
1090 EVT VT = Operand.getValueType();
1091 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1092
1093 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1094 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1095 DAG.getConstant(IID, DL, MVT::i32), Operand);
1096 };
1097
1098 // The sqrt and rsqrt refinement processes assume we always start out with an
1099 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1100 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1101 // any refinement, we must return a regular sqrt.
1102 if (Reciprocal || ExtraSteps > 0) {
1103 if (VT == MVT::f32)
1104 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1105 : Intrinsic::nvvm_rsqrt_approx_f);
1106 else if (VT == MVT::f64)
1107 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1108 else
1109 return SDValue();
1110 } else {
1111 if (VT == MVT::f32)
1112 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1113 : Intrinsic::nvvm_sqrt_approx_f);
1114 else {
1115 // There's no sqrt.approx.f64 instruction, so we emit
1116 // reciprocal(rsqrt(x)). This is faster than
1117 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1118 // x * rsqrt(x).)
1119 return DAG.getNode(
1121 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1122 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1123 }
1124 }
1125}
1126
1127SDValue
1129 SDLoc dl(Op);
1130 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1131 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1132 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1133 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1134}
1135
1136static bool IsTypePassedAsArray(const Type *Ty) {
1137 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1138 Ty->isHalfTy() || Ty->isBFloatTy();
1139}
1140
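// Builds the PTX ".callprototype" string used for indirect calls. For a callee
// of type float(int, float), the result looks roughly like:
//   prototype_<N> : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);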
1141std::string NVPTXTargetLowering::getPrototype(
1142 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1143 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1144 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1145 const CallBase &CB, unsigned UniqueCallSite) const {
1146 auto PtrVT = getPointerTy(DL);
1147
1148 bool isABI = (STI.getSmVersion() >= 20);
1149 assert(isABI && "Non-ABI compilation is not supported");
1150 if (!isABI)
1151 return "";
1152
1153 std::string Prototype;
1154 raw_string_ostream O(Prototype);
1155 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1156
1157 if (retTy->getTypeID() == Type::VoidTyID) {
1158 O << "()";
1159 } else {
1160 O << "(";
1161 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1162 !IsTypePassedAsArray(retTy)) {
1163 unsigned size = 0;
1164 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1165 size = ITy->getBitWidth();
1166 } else {
1167 assert(retTy->isFloatingPointTy() &&
1168 "Floating point type expected here");
1169 size = retTy->getPrimitiveSizeInBits();
1170 }
1171 // PTX ABI requires all scalar return values to be at least 32
1172 // bits in size. fp16 normally uses .b16 as its storage type in
1173 // PTX, so its size must be adjusted here, too.
1175
1176 O << ".param .b" << size << " _";
1177 } else if (isa<PointerType>(retTy)) {
1178 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1179 } else if (IsTypePassedAsArray(retTy)) {
1180 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1181 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1182 } else {
1183 llvm_unreachable("Unknown return type");
1184 }
1185 O << ") ";
1186 }
1187 O << "_ (";
1188
1189 bool first = true;
1190
1191 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1192 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1193 Type *Ty = Args[i].Ty;
1194 if (!first) {
1195 O << ", ";
1196 }
1197 first = false;
1198
1199 if (!Outs[OIdx].Flags.isByVal()) {
1200 if (IsTypePassedAsArray(Ty)) {
1201 Align ParamAlign =
1202 getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
1203 O << ".param .align " << ParamAlign.value() << " .b8 ";
1204 O << "_";
1205 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1206 // update the index for Outs
1207 SmallVector<EVT, 16> vtparts;
1208 ComputeValueVTs(*this, DL, Ty, vtparts);
1209 if (unsigned len = vtparts.size())
1210 OIdx += len - 1;
1211 continue;
1212 }
1213 // i8 types in IR will be i16 types in SDAG
1214 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1215 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1216 "type mismatch between callee prototype and arguments");
1217 // scalar type
1218 unsigned sz = 0;
1219 if (isa<IntegerType>(Ty)) {
1220 sz = cast<IntegerType>(Ty)->getBitWidth();
1222 } else if (isa<PointerType>(Ty)) {
1223 sz = PtrVT.getSizeInBits();
1224 } else {
1225 sz = Ty->getPrimitiveSizeInBits();
1226 }
1227 O << ".param .b" << sz << " ";
1228 O << "_";
1229 continue;
1230 }
1231
1232 // Indirect calls need strict ABI alignment so we disable optimizations by
1233 // not providing a function to optimize.
1234 Type *ETy = Args[i].IndirectType;
1235 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1236 Align ParamByValAlign =
1237 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1238
1239 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1240 O << "_";
1241 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1242 }
1243
1244 if (VAInfo)
1245 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1246 << " .b8 _[]\n";
1247 O << ")";
1249 O << " .noreturn";
1250 O << ";";
1251
1252 return Prototype;
1253}
1254
1256 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1257 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1258}
1259
1260Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1261 unsigned Idx,
1262 const DataLayout &DL) const {
1263 if (!CB) {
1264 // CallSite is zero, fallback to ABI type alignment
1265 return DL.getABITypeAlign(Ty);
1266 }
1267
1268 const Function *DirectCallee = CB->getCalledFunction();
1269
1270 if (!DirectCallee) {
1271 // We don't have a direct function symbol, but that may be because of
1272 // constant cast instructions in the call.
1273
1274 // With bitcast'd call targets, the instruction will be the call
1275 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1276 // Check if we have call alignment metadata
1277 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1278 return StackAlign.value();
1279 }
1280 DirectCallee = getMaybeBitcastedCallee(CB);
1281 }
1282
1283 // Check for function alignment information if we found that the
1284 // ultimate target is a Function
1285 if (DirectCallee)
1286 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1287
1288 // Call is indirect, fall back to the ABI type alignment
1289 return DL.getABITypeAlign(Ty);
1290}
1291
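// Maps FP (and packed FP) element types to the integer type of the same width
// so that the byte-wise shift/mask logic below can be applied. Returns true if
// the type was changed.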
1292static bool adjustElementType(EVT &ElementType) {
1293 switch (ElementType.getSimpleVT().SimpleTy) {
1294 default:
1295 return false;
1296 case MVT::f16:
1297 case MVT::bf16:
1298 ElementType = MVT::i16;
1299 return true;
1300 case MVT::f32:
1301 case MVT::v2f16:
1302 case MVT::v2bf16:
1303 ElementType = MVT::i32;
1304 return true;
1305 case MVT::f64:
1306 ElementType = MVT::i64;
1307 return true;
1308 }
1309}
1310
1311// Use byte-store when the param address of the argument value is unaligned.
1312// This may happen when the return value is a field of a packed structure.
1313//
1314// This is called in LowerCall() when passing the param values.
1315static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1316 uint64_t Offset, EVT ElementType,
1317 SDValue StVal, SDValue &InGlue,
1318 unsigned ArgID, const SDLoc &dl) {
1319 // Bit logic only works on integer types
1320 if (adjustElementType(ElementType))
1321 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1322
1323 // Store each byte
1324 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1325 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1326 // Shift the byte to the last byte position
1327 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1328 DAG.getConstant(i * 8, dl, MVT::i32));
1329 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1330 DAG.getConstant(Offset + i, dl, MVT::i32),
1331 ShiftVal, InGlue};
1332 // Trunc store only the last byte by using
1333 // st.param.b8
1334 // The register type can be larger than b8.
1335 Chain = DAG.getMemIntrinsicNode(
1336 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1338 InGlue = Chain.getValue(1);
1339 }
1340 return Chain;
1341}
1342
1343// Use byte-load when the param address of the returned value is unaligned.
1344// This may happen when the returned value is a field of a packed structure.
1345static SDValue
1346LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1347 EVT ElementType, SDValue &InGlue,
1348 SmallVectorImpl<SDValue> &TempProxyRegOps,
1349 const SDLoc &dl) {
1350 // Bit logic only works on integer types
1351 EVT MergedType = ElementType;
1352 adjustElementType(MergedType);
1353
1354 // Load each byte and construct the whole value. Initial value to 0
1355 SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1356 // LoadParamMemI8 loads into i16 register only
1357 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1358 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1359 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1360 DAG.getConstant(Offset + i, dl, MVT::i32),
1361 InGlue};
1362 // This will be selected to LoadParamMemI8
1363 SDValue LdVal =
1364 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1365 MVT::i8, MachinePointerInfo(), Align(1));
1366 SDValue TmpLdVal = LdVal.getValue(0);
1367 Chain = LdVal.getValue(1);
1368 InGlue = LdVal.getValue(2);
1369
1370 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1371 TmpLdVal.getSimpleValueType(), TmpLdVal);
1372 TempProxyRegOps.push_back(TmpLdVal);
1373
1374 SDValue CMask = DAG.getConstant(255, dl, MergedType);
1375 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1376 // Need to extend the i16 register to the whole width.
1377 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1378 // Mask off the high bits. Leave only the lower 8 bits.
1379 // Do this because we are using loadparam.b8.
1380 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1381 // Shift and merge
1382 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1383 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1384 }
1385 if (ElementType != MergedType)
1386 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1387
1388 return RetVal;
1389}
1390
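// Returns true when the callsite's function type does not match the callee's
// declared type; LowerCall() emits such calls as indirect calls.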
1391static bool shouldConvertToIndirectCall(const CallBase *CB,
1392 const GlobalAddressSDNode *Func) {
1393 if (!Func)
1394 return false;
1395 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1396 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1397 return false;
1398}
1399
1401 SmallVectorImpl<SDValue> &InVals) const {
1402
1403 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1405 "Support for variadic functions (unsized array parameter) introduced "
1406 "in PTX ISA version 6.0 and requires target sm_30.");
1407
1408 SelectionDAG &DAG = CLI.DAG;
1409 SDLoc dl = CLI.DL;
1411 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1413 SDValue Chain = CLI.Chain;
1414 SDValue Callee = CLI.Callee;
1415 bool &isTailCall = CLI.IsTailCall;
1416 ArgListTy &Args = CLI.getArgs();
1417 Type *RetTy = CLI.RetTy;
1418 const CallBase *CB = CLI.CB;
1419 const DataLayout &DL = DAG.getDataLayout();
1420
1421 bool isABI = (STI.getSmVersion() >= 20);
1422 assert(isABI && "Non-ABI compilation is not supported");
1423 if (!isABI)
1424 return Chain;
1425
1426 // Variadic arguments.
1427 //
1428 // Normally, for each argument, we declare a param scalar or a param
1429 // byte array in the .param space, and store the argument value to that
1430 // param scalar or array starting at offset 0.
1431 //
1432 // In the case of the first variadic argument, we declare a vararg byte array
1433 // with size 0. The exact size of this array isn't known at this point, so
1434 // it'll be patched later. All the variadic arguments will be stored to this
1435 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1436 // initially set to 0, so it can be used for non-variadic arguments (which use
1437 // 0 offset) to simplify the code.
1438 //
1439 // After all vararg is processed, 'VAOffset' holds the size of the
1440 // vararg byte array.
1441
1442 SDValue VADeclareParam; // vararg byte array
1443 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1444 unsigned VAOffset = 0; // current offset in the param array
1445
1446 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1447 SDValue TempChain = Chain;
1448 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1449 SDValue InGlue = Chain.getValue(1);
1450
1451 unsigned ParamCount = 0;
1452 // Args.size() and Outs.size() need not match.
1453 // Outs.size() will be larger
1454 // * if there is an aggregate argument with multiple fields (each field
1455 // showing up separately in Outs)
1456 // * if there is a vector argument with more than typical vector-length
1457 // elements (generally if more than 4) where each vector element is
1458 // individually present in Outs.
1459 // So a different index should be used for indexing into Outs/OutVals.
1460 // See similar issue in LowerFormalArguments.
1461 unsigned OIdx = 0;
1462 // Declare the .params or .reg needed to pass values
1463 // to the function
1464 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1465 EVT VT = Outs[OIdx].VT;
1466 Type *Ty = Args[i].Ty;
1467 bool IsVAArg = (i >= CLI.NumFixedArgs);
1468 bool IsByVal = Outs[OIdx].Flags.isByVal();
1469
1470 SmallVector<EVT, 16> VTs;
1471 SmallVector<uint64_t, 16> Offsets;
1472
1473 assert((!IsByVal || Args[i].IndirectType) &&
1474 "byval arg must have indirect type");
1475 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1476 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1477
1478 Align ArgAlign;
1479 if (IsByVal) {
1480 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1481 // so we don't need to worry whether it's naturally aligned or not.
1482 // See TargetLowering::LowerCallTo().
1483 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1484 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1485 InitialAlign, DL);
1486 if (IsVAArg)
1487 VAOffset = alignTo(VAOffset, ArgAlign);
1488 } else {
1489 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1490 }
1491
1492 unsigned TypeSize =
1493 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1494 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1495
1496 bool NeedAlign; // Does argument declaration specify alignment?
1497 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1498 if (IsVAArg) {
1499 if (ParamCount == FirstVAArg) {
1500 SDValue DeclareParamOps[] = {
1501 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1502 DAG.getConstant(ParamCount, dl, MVT::i32),
1503 DAG.getConstant(1, dl, MVT::i32), InGlue};
1504 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1505 DeclareParamVTs, DeclareParamOps);
1506 }
1507 NeedAlign = PassAsArray;
1508 } else if (PassAsArray) {
1509 // declare .param .align <align> .b8 .param<n>[<size>];
1510 SDValue DeclareParamOps[] = {
1511 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1512 DAG.getConstant(ParamCount, dl, MVT::i32),
1513 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1514 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1515 DeclareParamOps);
1516 NeedAlign = true;
1517 } else {
1518 // declare .param .b<size> .param<n>;
1519 if (VT.isInteger() || VT.isFloatingPoint()) {
1520 // PTX ABI requires integral types to be at least 32 bits in
1521 // size. FP16 is loaded/stored using i16, so it's handled
1522 // here as well.
1524 }
1525 SDValue DeclareScalarParamOps[] = {
1526 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1527 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1528 DAG.getConstant(0, dl, MVT::i32), InGlue};
1529 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1530 DeclareScalarParamOps);
1531 NeedAlign = false;
1532 }
1533 InGlue = Chain.getValue(1);
1534
1535 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1536 // than 32-bits are sign extended or zero extended, depending on
1537 // whether they are signed or unsigned types. This case applies
1538 // only to scalar parameters and not to aggregate values.
1539 bool ExtendIntegerParam =
1540 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1541
1542 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1543 SmallVector<SDValue, 6> StoreOperands;
1544 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1545 EVT EltVT = VTs[j];
1546 int CurOffset = Offsets[j];
1547 MaybeAlign PartAlign;
1548 if (NeedAlign)
1549 PartAlign = commonAlignment(ArgAlign, CurOffset);
1550
1551 SDValue StVal = OutVals[OIdx];
1552
1553 MVT PromotedVT;
1554 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1555 EltVT = EVT(PromotedVT);
1556 }
1557 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1559 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1560 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1561 }
1562
1563 if (IsByVal) {
1564 auto PtrVT = getPointerTy(DL);
1565 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1566 DAG.getConstant(CurOffset, dl, PtrVT));
1567 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1568 PartAlign);
1569 } else if (ExtendIntegerParam) {
1570 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1571 // zext/sext to i32
1572 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1574 dl, MVT::i32, StVal);
1575 }
1576
1577 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1578 // Use 16-bit registers for small stores as it's the
1579 // smallest general purpose register size supported by NVPTX.
1580 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1581 }
1582
1583 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1584 // scalar store. In such cases, fall back to byte stores.
1585 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1586 PartAlign.value() <
1587 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1588 assert(StoreOperands.empty() && "Unfinished preceding store.");
1590 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1591 StVal, InGlue, ParamCount, dl);
1592
1593 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1594 // into the SDAG, so just move on to the next element.
1595 if (!IsByVal)
1596 ++OIdx;
1597 continue;
1598 }
1599
1600 // New store.
1601 if (VectorInfo[j] & PVF_FIRST) {
1602 assert(StoreOperands.empty() && "Unfinished preceding store.");
1603 StoreOperands.push_back(Chain);
1604 StoreOperands.push_back(
1605 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1606
1607 StoreOperands.push_back(DAG.getConstant(
1608 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1609 dl, MVT::i32));
1610 }
1611
1612 // Record the value to store.
1613 StoreOperands.push_back(StVal);
1614
1615 if (VectorInfo[j] & PVF_LAST) {
1616 unsigned NumElts = StoreOperands.size() - 3;
1618 switch (NumElts) {
1619 case 1:
1621 break;
1622 case 2:
1624 break;
1625 case 4:
1627 break;
1628 default:
1629 llvm_unreachable("Invalid vector info.");
1630 }
1631
1632 StoreOperands.push_back(InGlue);
1633
1634 // Adjust type of the store op if we've extended the scalar
1635 // return value.
1636 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1637
1638 Chain = DAG.getMemIntrinsicNode(
1639 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1640 TheStoreType, MachinePointerInfo(), PartAlign,
1642 InGlue = Chain.getValue(1);
1643
1644 // Cleanup.
1645 StoreOperands.clear();
1646
1647 // TODO: We may need to support vector types that can be passed
1648 // as scalars in variadic arguments.
1649 if (!IsByVal && IsVAArg) {
1650 assert(NumElts == 1 &&
1651 "Vectorization is expected to be disabled for variadics.");
1652 VAOffset += DL.getTypeAllocSize(
1653 TheStoreType.getTypeForEVT(*DAG.getContext()));
1654 }
1655 }
1656 if (!IsByVal)
1657 ++OIdx;
1658 }
1659 assert(StoreOperands.empty() && "Unfinished parameter store.");
1660 if (!IsByVal && VTs.size() > 0)
1661 --OIdx;
1662 ++ParamCount;
1663 if (IsByVal && IsVAArg)
1664 VAOffset += TypeSize;
1665 }
1666
1667 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1668 MaybeAlign retAlignment = std::nullopt;
1669
1670 // Handle Result
1671 if (Ins.size() > 0) {
1672 SmallVector<EVT, 16> resvtparts;
1673 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1674
1675 // Declare
1676 // .param .align N .b8 retval0[<size-in-bytes>], or
1677 // .param .b<size-in-bits> retval0
1678 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1679 if (!IsTypePassedAsArray(RetTy)) {
1680 resultsz = promoteScalarArgumentSize(resultsz);
1681 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1682 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1683 DAG.getConstant(resultsz, dl, MVT::i32),
1684 DAG.getConstant(0, dl, MVT::i32), InGlue };
1685 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1686 DeclareRetOps);
1687 InGlue = Chain.getValue(1);
1688 } else {
1689 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1690 assert(retAlignment && "retAlignment is guaranteed to be set");
1691 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1692 SDValue DeclareRetOps[] = {
1693 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1694 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1695 DAG.getConstant(0, dl, MVT::i32), InGlue};
1696 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1697 DeclareRetOps);
1698 InGlue = Chain.getValue(1);
1699 }
1700 }
1701
1702 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1703 // Set the size of the vararg param byte array if the callee is a variadic
1704 // function and the variadic part is not empty.
1705 if (HasVAArgs) {
1706 SDValue DeclareParamOps[] = {
1707 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1708 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1709 VADeclareParam.getOperand(4)};
1710 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1711 VADeclareParam->getVTList(), DeclareParamOps);
1712 }
1713
1714 // If the type of the callsite does not match that of the function, convert
1715 // the callsite to an indirect call.
1716 bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1717
1718 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1719 // between them we must rely on the call site value which is valid for
1720 // indirect calls but is always null for libcalls.
1721 bool isIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1722
1723 if (isa<ExternalSymbolSDNode>(Callee)) {
1724 Function* CalleeFunc = nullptr;
1725
1726 // Try to find the callee in the current module.
1727 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1728 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1729
1730 // Set the "libcall callee" attribute to indicate that the function
1731 // must always have a declaration.
1732 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1733 }
1734
1735 if (isIndirectCall) {
1736 // This is indirect function call case : PTX requires a prototype of the
1737 // form
1738 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1739 // to be emitted, and the label has to be used as the last arg of the call
1740 // instruction.
1741 // The prototype is embedded in a string and put as the operand for a
1742 // CallPrototype SDNode which will print out to the value of the string.
1743 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1744 std::string Proto = getPrototype(
1745 DL, RetTy, Args, Outs, retAlignment,
1746 HasVAArgs
1747 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1748 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1749 : std::nullopt,
1750 *CB, UniqueCallSite);
1751 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1752 SDValue ProtoOps[] = {
1753 Chain,
1754 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1755 InGlue,
1756 };
1757 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1758 InGlue = Chain.getValue(1);
1759 }
1760 // Op to just print "call"
1761 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1762 SDValue PrintCallOps[] = {
1763 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1764 };
1765 // We model convergent calls as separate opcodes.
1767 if (CLI.IsConvergent)
1770 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1771 InGlue = Chain.getValue(1);
1772
1773 if (ConvertToIndirectCall) {
1774 // Copy the function ptr to a ptx register and use the register to call the
1775 // function.
1776 EVT DestVT = Callee.getValueType();
1778 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
1779 unsigned DestReg =
1780 RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT()));
1781 auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
1782 Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
1783 }
1784
1785 // Ops to print out the function name
1786 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1787 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1788 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1789 InGlue = Chain.getValue(1);
1790
1791 // Ops to print out the param list
1792 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1793 SDValue CallArgBeginOps[] = { Chain, InGlue };
1794 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1795 CallArgBeginOps);
1796 InGlue = Chain.getValue(1);
1797
1798 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
1799 ++i) {
1800 unsigned opcode;
1801 if (i == (e - 1))
1802 opcode = NVPTXISD::LastCallArg;
1803 else
1804 opcode = NVPTXISD::CallArg;
1805 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1806 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1807 DAG.getConstant(i, dl, MVT::i32), InGlue };
1808 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1809 InGlue = Chain.getValue(1);
1810 }
1811 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1812 SDValue CallArgEndOps[] = { Chain,
1813 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1814 InGlue };
1815 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1816 InGlue = Chain.getValue(1);
1817
1818 if (isIndirectCall) {
1819 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1820 SDValue PrototypeOps[] = {
1821 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
1822 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1823 InGlue = Chain.getValue(1);
1824 }
1825
1826 SmallVector<SDValue, 16> ProxyRegOps;
1827 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
1828 // An item of the vector is filled if the element does not need a ProxyReg
1829 // operation on it and should be added to InVals as is. ProxyRegOps and
1830 // ProxyRegTruncates contain empty/none items at the same index.
1831 SmallVector<SDValue, 16> RetElts;
1832 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
1833 // to use the values of `LoadParam`s; they are replaced later, when
1834 // `CALLSEQ_END` is added.
1835 SmallVector<SDValue, 16> TempProxyRegOps;
1836
1837 // Generate loads from param memory/moves from registers for result
1838 if (Ins.size() > 0) {
1839 SmallVector<EVT, 16> VTs;
1840 SmallVector<uint64_t, 16> Offsets;
1841 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1842 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1843
1844 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1845 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1846
1847 SmallVector<EVT, 6> LoadVTs;
1848 int VecIdx = -1; // Index of the first element of the vector.
1849
1850 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1851 // 32-bits are sign extended or zero extended, depending on whether
1852 // they are signed or unsigned types.
1853 bool ExtendIntegerRetVal =
1854 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1855
1856 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1857 bool needTruncate = false;
1858 EVT TheLoadType = VTs[i];
1859 EVT EltType = Ins[i].VT;
1860 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
1861 MVT PromotedVT;
1862
1863 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
1864 TheLoadType = EVT(PromotedVT);
1865 EltType = EVT(PromotedVT);
1866 needTruncate = true;
1867 }
1868
1869 if (ExtendIntegerRetVal) {
1870 TheLoadType = MVT::i32;
1871 EltType = MVT::i32;
1872 needTruncate = true;
1873 } else if (TheLoadType.getSizeInBits() < 16) {
1874 if (VTs[i].isInteger())
1875 needTruncate = true;
1876 EltType = MVT::i16;
1877 }
1878
1879 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1880 // scalar load. In such cases, fall back to byte loads.
1881 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
1882 EltAlign < DL.getABITypeAlign(
1883 TheLoadType.getTypeForEVT(*DAG.getContext()))) {
1884 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1885 SDValue Ret = LowerUnalignedLoadRetParam(
1886 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
1887 ProxyRegOps.push_back(SDValue());
1888 ProxyRegTruncates.push_back(std::optional<MVT>());
1889 RetElts.resize(i);
1890 RetElts.push_back(Ret);
1891
1892 continue;
1893 }
1894
1895 // Record index of the very first element of the vector.
1896 if (VectorInfo[i] & PVF_FIRST) {
1897 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1898 VecIdx = i;
1899 }
1900
1901 LoadVTs.push_back(EltType);
1902
1903 if (VectorInfo[i] & PVF_LAST) {
1904 unsigned NumElts = LoadVTs.size();
1905 LoadVTs.push_back(MVT::Other);
1906 LoadVTs.push_back(MVT::Glue);
1907 NVPTXISD::NodeType Op;
1908 switch (NumElts) {
1909 case 1:
1910 Op = NVPTXISD::LoadParam;
1911 break;
1912 case 2:
1913 Op = NVPTXISD::LoadParamV2;
1914 break;
1915 case 4:
1916 Op = NVPTXISD::LoadParamV4;
1917 break;
1918 default:
1919 llvm_unreachable("Invalid vector info.");
1920 }
1921
1922 SDValue LoadOperands[] = {
1923 Chain, DAG.getConstant(1, dl, MVT::i32),
1924 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
1925 SDValue RetVal = DAG.getMemIntrinsicNode(
1926 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1927 MachinePointerInfo(), EltAlign,
1928 MachineMemOperand::MOLoad);
1929
1930 for (unsigned j = 0; j < NumElts; ++j) {
1931 ProxyRegOps.push_back(RetVal.getValue(j));
1932
1933 if (needTruncate)
1934 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
1935 else
1936 ProxyRegTruncates.push_back(std::optional<MVT>());
1937 }
1938
1939 Chain = RetVal.getValue(NumElts);
1940 InGlue = RetVal.getValue(NumElts + 1);
1941
1942 // Cleanup
1943 VecIdx = -1;
1944 LoadVTs.clear();
1945 }
1946 }
1947 }
1948
1949 Chain =
1950 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
1951 InGlue = Chain.getValue(1);
1952
1953 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1954 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1955 // dangling.
1956 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1957 if (i < RetElts.size() && RetElts[i]) {
1958 InVals.push_back(RetElts[i]);
1959 continue;
1960 }
1961
1962 SDValue Ret = DAG.getNode(
1963 NVPTXISD::ProxyReg, dl,
1964 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1965 { Chain, ProxyRegOps[i], InGlue }
1966 );
1967
1968 Chain = Ret.getValue(1);
1969 InGlue = Ret.getValue(2);
1970
1971 if (ProxyRegTruncates[i]) {
1972 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
1973 }
1974
1975 InVals.push_back(Ret);
1976 }
1977
1978 for (SDValue &T : TempProxyRegOps) {
1979 SDValue Repl = DAG.getNode(
1980 NVPTXISD::ProxyReg, dl,
1981 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
1982 {Chain, T.getOperand(0), InGlue});
1983 DAG.ReplaceAllUsesWith(T, Repl);
1984 DAG.RemoveDeadNode(T.getNode());
1985
1986 Chain = Repl.getValue(1);
1987 InGlue = Repl.getValue(2);
1988 }
1989
1990 // set isTailCall to false for now, until we figure out how to express
1991 // tail call optimization in PTX
1992 isTailCall = false;
1993 return Chain;
1994}
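// Illustrative note: the pseudo ops built in LowerCall (CallPrototype,
// PrintCall*, CallVoid, CallArgBegin/CallArg/CallArgEnd, Prototype, LoadParam,
// ProxyReg) are only printed later by the asm printer. The resulting PTX call
// sequence looks roughly like (sketch, exact registers and types vary):
//   { // callseq
//   .param .b32 param0;
//   st.param.b32 [param0], %r1;
//   .param .b32 retval0;
//   call.uni (retval0), foo, (param0);
//   ld.param.b32 %r2, [retval0];
//   } // callseq end
// with the braces corresponding to CALLSEQ_START/CALLSEQ_END and %r2 being
// the ProxyReg-forwarded return value.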
1995
1996 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1997 SelectionDAG &DAG) const {
1998
1999 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2000 const Function &Fn = DAG.getMachineFunction().getFunction();
2001
2002 DiagnosticInfoUnsupported NoDynamicAlloca(
2003 Fn,
2004 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2005 "requires target sm_52.",
2006 SDLoc(Op).getDebugLoc());
2007 DAG.getContext()->diagnose(NoDynamicAlloca);
2008 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2009 Op.getOperand(0)};
2010 return DAG.getMergeValues(Ops, SDLoc());
2011 }
2012
2013 SDValue Chain = Op.getOperand(0);
2014 SDValue Size = Op.getOperand(1);
2015 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2016 SDLoc DL(Op.getNode());
2017
2018 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2019 MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32;
2020
2021 SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy),
2022 DAG.getTargetConstant(Align, DL, MVT::i32)};
2023 EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2024 return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps);
2025}
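// Worked example (illustrative): for `%p = alloca i8, i32 %n, align 16` on a
// 64-bit target, the DYNAMIC_STACKALLOC node built above carries an i64 size
// and a target constant of 16, and is eventually printed as a PTX
// local-stack allocation roughly of the form
//   alloca.u64 %rd2, %rd1, 16;
// (exact syntax depends on the PTX ISA version; PTX 7.3 / sm_52 or newer is
// required, as diagnosed above).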
2026
2027 SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
2028 SelectionDAG &DAG) const {
2029 SDLoc DL(Op.getNode());
2030 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2031 const Function &Fn = DAG.getMachineFunction().getFunction();
2032
2033 DiagnosticInfoUnsupported NoStackRestore(
2034 Fn,
2035 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
2036 ">= sm_52.",
2037 DL.getDebugLoc());
2038 DAG.getContext()->diagnose(NoStackRestore);
2039 return Op.getOperand(0);
2040 }
2041
2042 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
2043 SDValue Chain = Op.getOperand(0);
2044 SDValue Ptr = Op.getOperand(1);
2045 SDValue ASC = DAG.getAddrSpaceCast(
2046 DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC, ADDRESS_SPACE_LOCAL);
2047 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
2048}
2049
2050 SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
2051 SelectionDAG &DAG) const {
2052 SDLoc DL(Op.getNode());
2053 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2054 const Function &Fn = DAG.getMachineFunction().getFunction();
2055
2056 DiagnosticInfoUnsupported NoStackSave(
2057 Fn,
2058 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
2059 "sm_52.",
2060 DL.getDebugLoc());
2061 DAG.getContext()->diagnose(NoStackSave);
2062 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
2063 return DAG.getMergeValues(Ops, DL);
2064 }
2065
2066 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
2067 SDValue Chain = Op.getOperand(0);
2068 SDValue SS =
2069 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
2070 SDValue ASC = DAG.getAddrSpaceCast(
2071 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
2072 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
2073}
2074
2075// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2076// (see LegalizeDAG.cpp). This is slow and uses local memory.
2077// We use extract/insert/build-vector here, just as LegalizeOp() did in LLVM 2.5.
2078SDValue
2079NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2080 SDNode *Node = Op.getNode();
2081 SDLoc dl(Node);
2082 SmallVector<SDValue, 8> Ops;
2083 unsigned NumOperands = Node->getNumOperands();
2084 for (unsigned i = 0; i < NumOperands; ++i) {
2085 SDValue SubOp = Node->getOperand(i);
2086 EVT VVT = SubOp.getNode()->getValueType(0);
2087 EVT EltVT = VVT.getVectorElementType();
2088 unsigned NumSubElem = VVT.getVectorNumElements();
2089 for (unsigned j = 0; j < NumSubElem; ++j) {
2090 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2091 DAG.getIntPtrConstant(j, dl)));
2092 }
2093 }
2094 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2095}
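// Example (illustrative): concat_vectors of two v2f32 operands A and B is
// rebuilt here as
//   build_vector (extractelt A, 0), (extractelt A, 1),
//                (extractelt B, 0), (extractelt B, 1)
// i.e. a single v4f32 BUILD_VECTOR, avoiding the store/reload through local
// memory that ExpandVectorBuildThroughStack() would otherwise perform.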
2096
2097SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2098 // Handle bitcasting from v2i8 without hitting the default promotion
2099 // strategy which goes through stack memory.
2100 EVT FromVT = Op->getOperand(0)->getValueType(0);
2101 if (FromVT != MVT::v2i8) {
2102 return Op;
2103 }
2104
2105 // Pack vector elements into i16 and bitcast to final type
2106 SDLoc DL(Op);
2107 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2108 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2109 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2110 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2111 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2112 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2113 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2114 SDValue AsInt = DAG.getNode(
2115 ISD::OR, DL, MVT::i16,
2116 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2117 EVT ToVT = Op->getValueType(0);
2118 return MaybeBitcast(DAG, DL, ToVT, AsInt);
2119}
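// Worked example (illustrative): bitcasting <2 x i8> <i8 0xAA, i8 0xBB> to
// i16 is lowered above as
//   (zext i16 0xAA) | ((zext i16 0xBB) << 8)  ==  0xBBAA
// followed by a bitcast of that i16 to the requested destination type, so no
// stack temporary is needed.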
2120
2121// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2122// would get lowered as two constant loads and vector-packing move.
2123// Instead we want just a constant move:
2124// mov.b32 %r2, 0x40003C00
2125SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2126 SelectionDAG &DAG) const {
2127 EVT VT = Op->getValueType(0);
2128 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2129 return Op;
2130 SDLoc DL(Op);
2131
2132 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2133 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2134 isa<ConstantFPSDNode>(Operand);
2135 })) {
2136 if (VT != MVT::v4i8)
2137 return Op;
2138 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2139 // to optimize calculation of constant parts.
2140 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2141 uint64_t SelectionValue) -> SDValue {
2142 SDValue L = Left;
2143 SDValue R = Right;
2144 if (Cast) {
2145 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2146 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2147 }
2148 return DAG.getNode(
2149 NVPTXISD::PRMT, DL, MVT::v4i8,
2150 {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32),
2151 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
2152 };
2153 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2154 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2155 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2156 return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
2157 }
2158
2159 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2160 auto GetOperand = [](SDValue Op, int N) -> APInt {
2161 const SDValue &Operand = Op->getOperand(N);
2162 EVT VT = Op->getValueType(0);
2163 if (Operand->isUndef())
2164 return APInt(32, 0);
2165 APInt Value;
2166 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2167 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2168 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2169 Value = Operand->getAsAPIntVal();
2170 else
2171 llvm_unreachable("Unsupported type");
2172 // i8 values are carried around as i16, so we need to zero out upper bits,
2173 // so they do not get in the way of combining individual byte values
2174 if (VT == MVT::v4i8)
2175 Value = Value.trunc(8);
2176 return Value.zext(32);
2177 };
2178 APInt Value;
2179 if (Isv2x16VT(VT)) {
2180 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2181 } else if (VT == MVT::v4i8) {
2182 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2183 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2184 } else {
2185 llvm_unreachable("Unsupported type");
2186 }
2187 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2188 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2189}
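// Notes on the PRMT selectors used above (illustrative): prmt.b32 treats its
// two source registers as an 8-byte pool (bytes 0-3 from the first operand,
// 4-7 from the second), and each selector nibble, low to high, picks the byte
// for the corresponding result position. Selector 0x3340 thus packs byte 0 of
// each extended element into result bytes 0 and 1, and selector 0x5410 then
// merges the two partial results into {e0, e1, e2, e3}. For an all-constant
// vector such as v2f16 <1.0, 2.0>, the constant path below instead emits the
// single move shown in the comment above (mov.b32 %r, 0x40003C00).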
2190
2191SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2192 SelectionDAG &DAG) const {
2193 SDValue Index = Op->getOperand(1);
2194 SDValue Vector = Op->getOperand(0);
2195 SDLoc DL(Op);
2196 EVT VectorVT = Vector.getValueType();
2197
2198 if (VectorVT == MVT::v4i8) {
2199 SDValue BFE =
2200 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2201 {Vector,
2202 DAG.getNode(ISD::MUL, DL, MVT::i32,
2203 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2204 DAG.getConstant(8, DL, MVT::i32)),
2205 DAG.getConstant(8, DL, MVT::i32)});
2206 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2207 }
2208
2209 // Constant index will be matched by tablegen.
2210 if (isa<ConstantSDNode>(Index.getNode()))
2211 return Op;
2212
2213 // Extract individual elements and select one of them.
2214 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2215 EVT EltVT = VectorVT.getVectorElementType();
2216
2217 SDLoc dl(Op.getNode());
2218 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2219 DAG.getIntPtrConstant(0, dl));
2220 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2221 DAG.getIntPtrConstant(1, dl));
2222 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2223 ISD::CondCode::SETEQ);
2224}
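// Worked example (illustrative): extracting element 2 of a v4i8 vector held
// in a .b32 register with value 0xDDCCBBAA uses BFE with start = 2*8 = 16 and
// length = 8, yielding 0xCC; the i8x4 value never has to be unpacked into
// separate byte registers.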
2225
2226SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2227 SelectionDAG &DAG) const {
2228 SDValue Vector = Op->getOperand(0);
2229 EVT VectorVT = Vector.getValueType();
2230
2231 if (VectorVT != MVT::v4i8)
2232 return Op;
2233 SDLoc DL(Op);
2234 SDValue Value = Op->getOperand(1);
2235 if (Value->isUndef())
2236 return Vector;
2237
2238 SDValue Index = Op->getOperand(2);
2239
2240 SDValue BFI =
2241 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2242 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2243 DAG.getNode(ISD::MUL, DL, MVT::i32,
2244 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2245 DAG.getConstant(8, DL, MVT::i32)),
2246 DAG.getConstant(8, DL, MVT::i32)});
2247 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2248}
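// Worked example (illustrative): inserting 0xEE as element 1 of a v4i8 value
// 0xDDCCBBAA uses BFI to deposit the low 8 bits of the new value at bit
// offset 1*8 = 8, producing 0xDDCCEEAA, which is then bitcast back to v4i8.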
2249
2250SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2251 SelectionDAG &DAG) const {
2252 SDValue V1 = Op.getOperand(0);
2253 EVT VectorVT = V1.getValueType();
2254 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2255 return Op;
2256
2257 // Lower shuffle to PRMT instruction.
2258 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2259 SDValue V2 = Op.getOperand(1);
2260 uint32_t Selector = 0;
2261 for (auto I : llvm::enumerate(SVN->getMask())) {
2262 if (I.value() != -1) // -1 is a placeholder for undef.
2263 Selector |= (I.value() << (I.index() * 4));
2264 }
2265
2266 SDLoc DL(Op);
2267 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2268 DAG.getConstant(Selector, DL, MVT::i32),
2269 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2270}
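// Worked example (illustrative): for a shuffle of two v4i8 values with mask
// <0, 4, 1, 5>, the selector built above is 0x5140 (one nibble per result
// byte, low to high; values 0-3 select bytes of V1, 4-7 select bytes of V2),
// so a single prmt.b32 interleaves the low halves of V1 and V2. Undef mask
// entries (-1) simply leave their nibble as 0.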
2271/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2272/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2273/// amount, or
2274/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2275/// amount.
2276SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2277 SelectionDAG &DAG) const {
2278 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2279 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2280
2281 EVT VT = Op.getValueType();
2282 unsigned VTBits = VT.getSizeInBits();
2283 SDLoc dl(Op);
2284 SDValue ShOpLo = Op.getOperand(0);
2285 SDValue ShOpHi = Op.getOperand(1);
2286 SDValue ShAmt = Op.getOperand(2);
2287 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2288
2289 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2290 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2291 // {dHi, dLo} = {aHi, aLo} >> Amt
2292 // dHi = aHi >> Amt
2293 // dLo = shf.r.clamp aLo, aHi, Amt
2294
2295 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2296 SDValue Lo =
2297 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2298
2299 SDValue Ops[2] = { Lo, Hi };
2300 return DAG.getMergeValues(Ops, dl);
2301 }
2302 else {
2303 // {dHi, dLo} = {aHi, aLo} >> Amt
2304 // - if (Amt>=size) then
2305 // dLo = aHi >> (Amt-size)
2306 // dHi = aHi >> Amt (this is either all 0 or all 1)
2307 // else
2308 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2309 // dHi = aHi >> Amt
2310
2311 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2312 DAG.getConstant(VTBits, dl, MVT::i32),
2313 ShAmt);
2314 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2315 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2316 DAG.getConstant(VTBits, dl, MVT::i32));
2317 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2318 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2319 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2320
2321 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2322 DAG.getConstant(VTBits, dl, MVT::i32),
2323 ISD::SETGE);
2324 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2325 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2326
2327 SDValue Ops[2] = { Lo, Hi };
2328 return DAG.getMergeValues(Ops, dl);
2329 }
2330}
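// Worked example (illustrative): srl_parts of the i64 value 0x0000001234567890
// split as {Hi = 0x00000012, Lo = 0x34567890}, shifted right by Amt = 8, gives
//   Lo = (0x34567890 >> 8) | (0x00000012 << 24) = 0x12345678
//   Hi =  0x00000012 >> 8                       = 0x00000000
// On sm_35+ the Lo computation is a single shf.r.clamp (funnel shift with the
// amount clamped at 32); otherwise the select-based expansion above handles
// the Amt >= 32 case separately.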
2331
2332/// LowerShiftLeftParts - Lower SHL_PARTS, which
2333/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2334/// amount, or
2335/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2336/// amount.
2337SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2338 SelectionDAG &DAG) const {
2339 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2340 assert(Op.getOpcode() == ISD::SHL_PARTS);
2341
2342 EVT VT = Op.getValueType();
2343 unsigned VTBits = VT.getSizeInBits();
2344 SDLoc dl(Op);
2345 SDValue ShOpLo = Op.getOperand(0);
2346 SDValue ShOpHi = Op.getOperand(1);
2347 SDValue ShAmt = Op.getOperand(2);
2348
2349 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2350 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2351 // {dHi, dLo} = {aHi, aLo} << Amt
2352 // dHi = shf.l.clamp aLo, aHi, Amt
2353 // dLo = aLo << Amt
2354
2355 SDValue Hi =
2356 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2357 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2358
2359 SDValue Ops[2] = { Lo, Hi };
2360 return DAG.getMergeValues(Ops, dl);
2361 }
2362 else {
2363 // {dHi, dLo} = {aHi, aLo} << Amt
2364 // - if (Amt>=size) then
2365 // dLo = aLo << Amt (all 0)
2366 // dLo = aLo << (Amt-size)
2367 // else
2368 // dLo = aLo << Amt
2369 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2370
2371 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2372 DAG.getConstant(VTBits, dl, MVT::i32),
2373 ShAmt);
2374 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2375 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2376 DAG.getConstant(VTBits, dl, MVT::i32));
2377 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2378 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2379 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2380
2381 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2382 DAG.getConstant(VTBits, dl, MVT::i32),
2383 ISD::SETGE);
2384 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2385 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2386
2387 SDValue Ops[2] = { Lo, Hi };
2388 return DAG.getMergeValues(Ops, dl);
2389 }
2390}
2391
2392/// If the types match, convert the generic copysign to the NVPTXISD version,
2393/// otherwise bail, ensuring that mismatched cases are properly expanded.
2394SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2395 SelectionDAG &DAG) const {
2396 EVT VT = Op.getValueType();
2397 SDLoc DL(Op);
2398
2399 SDValue In1 = Op.getOperand(0);
2400 SDValue In2 = Op.getOperand(1);
2401 EVT SrcVT = In2.getValueType();
2402
2403 if (!SrcVT.bitsEq(VT))
2404 return SDValue();
2405
2406 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2407}
2408
2409SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2410 EVT VT = Op.getValueType();
2411
2412 if (VT == MVT::f32)
2413 return LowerFROUND32(Op, DAG);
2414
2415 if (VT == MVT::f64)
2416 return LowerFROUND64(Op, DAG);
2417
2418 llvm_unreachable("unhandled type");
2419}
2420
2421 // This is the rounding method used in CUDA libdevice, in C-like code:
2422// float roundf(float A)
2423// {
2424// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2425// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2426// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2427// }
2428SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2429 SelectionDAG &DAG) const {
2430 SDLoc SL(Op);
2431 SDValue A = Op.getOperand(0);
2432 EVT VT = Op.getValueType();
2433
2434 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2435
2436 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2437 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2438 const unsigned SignBitMask = 0x80000000;
2439 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2440 DAG.getConstant(SignBitMask, SL, MVT::i32));
2441 const unsigned PointFiveInBits = 0x3F000000;
2442 SDValue PointFiveWithSignRaw =
2443 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2444 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2445 SDValue PointFiveWithSign =
2446 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2447 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2448 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2449
2450 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2451 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2452 SDValue IsLarge =
2453 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2454 ISD::SETOGT);
2455 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2456
2457 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2458 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2459 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2460 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2461 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2462}
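// Worked examples (illustrative) for the sequence above: round(2.3) adds a
// sign-matched 0.5 to get 2.8 and truncates to 2.0; round(-2.7) becomes -3.2
// and truncates to -3.0. The |A| < 0.5 guard exists because the add itself
// can round up: for the largest f32 below 0.5, A + 0.5 rounds to 1.0, so the
// guard returns trunc(A) = 0.0 instead. The |A| > 2^23 guard returns A
// unchanged, since such values are already integers and adding 0.5 could
// perturb them.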
2463
2464// The implementation of round(double) is similar to that of round(float) in
2465// that they both separate the value range into three regions and use a method
2466// specific to the region to round the values. However, round(double) first
2467// calculates the round of the absolute value and then adds the sign back while
2468// round(float) directly rounds the value with sign.
2469SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2470 SelectionDAG &DAG) const {
2471 SDLoc SL(Op);
2472 SDValue A = Op.getOperand(0);
2473 EVT VT = Op.getValueType();
2474
2475 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2476
2477 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2478 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2479 DAG.getConstantFP(0.5, SL, VT));
2480 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2481
2482 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2483 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2484 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2485 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2486 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2487 DAG.getConstantFP(0, SL, VT),
2488 RoundedA);
2489
2490 // Add sign to rounded_A
2491 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2492 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2493
2494 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2495 SDValue IsLarge =
2496 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2497 ISD::SETOGT);
2498 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2499}
2500
2501SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2502 SelectionDAG &DAG) const {
2503 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2504
2505 if (Op.getValueType() == MVT::bf16) {
2506 SDLoc Loc(Op);
2507 return DAG.getNode(
2508 ISD::FP_ROUND, Loc, MVT::bf16,
2509 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2510 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2511 }
2512
2513 // Everything else is considered legal.
2514 return Op;
2515}
2516
2517SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2518 SelectionDAG &DAG) const {
2519 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2520
2521 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2522 SDLoc Loc(Op);
2523 return DAG.getNode(
2524 Op.getOpcode(), Loc, Op.getValueType(),
2525 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2526 }
2527
2528 // Everything else is considered legal.
2529 return Op;
2530}
2531
2532SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2533 SelectionDAG &DAG) const {
2534 EVT NarrowVT = Op.getValueType();
2535 SDValue Wide = Op.getOperand(0);
2536 EVT WideVT = Wide.getValueType();
2537 if (NarrowVT.getScalarType() == MVT::bf16) {
2538 const TargetLowering *TLI = STI.getTargetLowering();
2539 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2540 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2541 }
2542 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2543 // This combination was the first to support f32 -> bf16.
2544 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2545 if (WideVT.getScalarType() == MVT::f32) {
2546 return Op;
2547 }
2548 if (WideVT.getScalarType() == MVT::f64) {
2549 SDLoc Loc(Op);
2550 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2551 // the hardware f32 -> bf16 instruction.
2552 SDValue rod = TLI->expandRoundInexactToOdd(
2553 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2554 : MVT::f32,
2555 Wide, Loc, DAG);
2556 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2557 }
2558 }
2559 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2560 }
2561 }
2562
2563 // Everything else is considered legal.
2564 return Op;
2565}
2566
2567SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2568 SelectionDAG &DAG) const {
2569 SDValue Narrow = Op.getOperand(0);
2570 EVT NarrowVT = Narrow.getValueType();
2571 EVT WideVT = Op.getValueType();
2572 if (NarrowVT.getScalarType() == MVT::bf16) {
2573 if (WideVT.getScalarType() == MVT::f32 &&
2574 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2575 SDLoc Loc(Op);
2576 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2577 }
2578 if (WideVT.getScalarType() == MVT::f64 &&
2579 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2580 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2581 : MVT::f32;
2582 SDLoc Loc(Op);
2583 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2584 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2585 } else {
2586 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2587 }
2588 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2589 }
2590 }
2591
2592 // Everything else is considered legal.
2593 return Op;
2594}
2595
2596 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2597 SDLoc DL(Op);
2598 if (Op.getValueType() != MVT::v2i16)
2599 return Op;
2600 EVT EltVT = Op.getValueType().getVectorElementType();
2601 SmallVector<SDValue> VecElements;
2602 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2603 SmallVector<SDValue> ScalarArgs;
2604 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2605 [&](const SDUse &O) {
2606 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2607 O.get(), DAG.getIntPtrConstant(I, DL));
2608 });
2609 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2610 }
2611 SDValue V =
2612 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2613 return V;
2614}
2615
2616SDValue
2617 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2618 switch (Op.getOpcode()) {
2619 case ISD::RETURNADDR:
2620 return SDValue();
2621 case ISD::FRAMEADDR:
2622 return SDValue();
2623 case ISD::GlobalAddress:
2624 return LowerGlobalAddress(Op, DAG);
2625 case ISD::INTRINSIC_W_CHAIN:
2626 return Op;
2627 case ISD::BUILD_VECTOR:
2628 return LowerBUILD_VECTOR(Op, DAG);
2629 case ISD::BITCAST:
2630 return LowerBITCAST(Op, DAG);
2631 case ISD::EXTRACT_SUBVECTOR:
2632 return Op;
2633 case ISD::EXTRACT_VECTOR_ELT:
2634 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2635 case ISD::INSERT_VECTOR_ELT:
2636 return LowerINSERT_VECTOR_ELT(Op, DAG);
2637 case ISD::VECTOR_SHUFFLE:
2638 return LowerVECTOR_SHUFFLE(Op, DAG);
2639 case ISD::CONCAT_VECTORS:
2640 return LowerCONCAT_VECTORS(Op, DAG);
2641 case ISD::STORE:
2642 return LowerSTORE(Op, DAG);
2643 case ISD::LOAD:
2644 return LowerLOAD(Op, DAG);
2645 case ISD::SHL_PARTS:
2646 return LowerShiftLeftParts(Op, DAG);
2647 case ISD::SRA_PARTS:
2648 case ISD::SRL_PARTS:
2649 return LowerShiftRightParts(Op, DAG);
2650 case ISD::SELECT:
2651 return LowerSelect(Op, DAG);
2652 case ISD::FROUND:
2653 return LowerFROUND(Op, DAG);
2654 case ISD::FCOPYSIGN:
2655 return LowerFCOPYSIGN(Op, DAG);
2656 case ISD::SINT_TO_FP:
2657 case ISD::UINT_TO_FP:
2658 return LowerINT_TO_FP(Op, DAG);
2659 case ISD::FP_TO_SINT:
2660 case ISD::FP_TO_UINT:
2661 return LowerFP_TO_INT(Op, DAG);
2662 case ISD::FP_ROUND:
2663 return LowerFP_ROUND(Op, DAG);
2664 case ISD::FP_EXTEND:
2665 return LowerFP_EXTEND(Op, DAG);
2666 case ISD::BR_JT:
2667 return LowerBR_JT(Op, DAG);
2668 case ISD::VAARG:
2669 return LowerVAARG(Op, DAG);
2670 case ISD::VASTART:
2671 return LowerVASTART(Op, DAG);
2672 case ISD::ABS:
2673 case ISD::SMIN:
2674 case ISD::SMAX:
2675 case ISD::UMIN:
2676 case ISD::UMAX:
2677 case ISD::ADD:
2678 case ISD::SUB:
2679 case ISD::MUL:
2680 case ISD::SHL:
2681 case ISD::SREM:
2682 case ISD::UREM:
2683 return LowerVectorArith(Op, DAG);
2684 case ISD::DYNAMIC_STACKALLOC:
2685 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2686 case ISD::STACKRESTORE:
2687 return LowerSTACKRESTORE(Op, DAG);
2688 case ISD::STACKSAVE:
2689 return LowerSTACKSAVE(Op, DAG);
2690 case ISD::CopyToReg:
2691 return LowerCopyToReg_128(Op, DAG);
2692 default:
2693 llvm_unreachable("Custom lowering not defined for operation");
2694 }
2695}
2696
2697SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
2698 SDLoc DL(Op);
2699 SDValue Chain = Op.getOperand(0);
2700 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
2701 SDValue Index = Op.getOperand(2);
2702
2703 unsigned JId = JT->getIndex();
2704 MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
2705 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2706
2707 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
2708
2709 // Generate BrxStart node
2710 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2711 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
2712
2713 // Generate BrxItem nodes
2714 assert(!MBBs.empty());
2715 for (MachineBasicBlock *MBB : MBBs.drop_back())
2716 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
2717 DAG.getBasicBlock(MBB), Chain.getValue(1));
2718
2719 // Generate BrxEnd nodes
2720 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
2721 IdV, Chain.getValue(1)};
2722 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
2723
2724 return BrxEnd;
2725}
2726
2727// This will prevent AsmPrinter from trying to print the jump tables itself.
2728 unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
2729 return MachineJumpTableInfo::EK_Inline;
2730}
2731
2732// This function is almost a copy of SelectionDAG::expandVAArg().
2733// The only diff is that this one produces loads from local address space.
2734SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2735 const TargetLowering *TLI = STI.getTargetLowering();
2736 SDLoc DL(Op);
2737
2738 SDNode *Node = Op.getNode();
2739 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2740 EVT VT = Node->getValueType(0);
2741 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2742 SDValue Tmp1 = Node->getOperand(0);
2743 SDValue Tmp2 = Node->getOperand(1);
2744 const MaybeAlign MA(Node->getConstantOperandVal(3));
2745
2746 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2747 Tmp1, Tmp2, MachinePointerInfo(V));
2748 SDValue VAList = VAListLoad;
2749
2750 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2751 VAList = DAG.getNode(
2752 ISD::ADD, DL, VAList.getValueType(), VAList,
2753 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2754
2755 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
2756 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
2757 VAList.getValueType()));
2758 }
2759
2760 // Increment the pointer, VAList, to the next vaarg
2761 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2762 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2763 DL, VAList.getValueType()));
2764
2765 // Store the incremented VAList to the legalized pointer
2766 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2767 MachinePointerInfo(V));
2768
2769 const Value *SrcV =
2770 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2771
2772 // Load the actual argument out of the pointer VAList
2773 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2774}
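// Worked example (illustrative): for a vaarg of type double with the va_list
// pointer currently at 0x1004 and MA = Align(8), the sequence above computes
//   VAList = (0x1004 + 7) & -8 = 0x1008
// loads the argument from [0x1008], and stores back 0x1008 + 8 = 0x1010 as
// the new va_list value. The loads and stores target the local address
// space, which is the only difference from SelectionDAG::expandVAArg().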
2775
2776SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2777 const TargetLowering *TLI = STI.getTargetLowering();
2778 SDLoc DL(Op);
2779 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2780
2781 // Store the address of unsized array <function>_vararg[] in the ap object.
2782 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2783 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2784
2785 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2786 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2787 MachinePointerInfo(SV));
2788}
2789
2790SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2791 SDValue Op0 = Op->getOperand(0);
2792 SDValue Op1 = Op->getOperand(1);
2793 SDValue Op2 = Op->getOperand(2);
2794 SDLoc DL(Op.getNode());
2795
2796 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2797
2798 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2799 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2800 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2801 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2802
2803 return Trunc;
2804}
2805
2806SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2807 if (Op.getValueType() == MVT::i1)
2808 return LowerLOADi1(Op, DAG);
2809
2810 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2811 // unaligned loads and have to handle it here.
2812 EVT VT = Op.getValueType();
2813 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2814 LoadSDNode *Load = cast<LoadSDNode>(Op);
2815 EVT MemVT = Load->getMemoryVT();
2816 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2817 MemVT, *Load->getMemOperand())) {
2818 SDValue Ops[2];
2819 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2820 return DAG.getMergeValues(Ops, SDLoc(Op));
2821 }
2822 }
2823
2824 return SDValue();
2825}
2826
2827// v = ld i1* addr
2828// =>
2829// v1 = ld i8* addr (-> i16)
2830// v = trunc i16 to i1
2831SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2832 SDNode *Node = Op.getNode();
2833 LoadSDNode *LD = cast<LoadSDNode>(Node);
2834 SDLoc dl(Node);
2835 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2836 assert(Node->getValueType(0) == MVT::i1 &&
2837 "Custom lowering for i1 load only");
2838 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
2839 LD->getBasePtr(), LD->getPointerInfo(),
2840 MVT::i8, LD->getAlign(),
2841 LD->getMemOperand()->getFlags());
2842 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2843 // The legalizer (the caller) is expecting two values from the legalized
2844 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2845 // in LegalizeDAG.cpp which also uses MergeValues.
2846 SDValue Ops[] = { result, LD->getChain() };
2847 return DAG.getMergeValues(Ops, dl);
2848}
2849
2850SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2851 StoreSDNode *Store = cast<StoreSDNode>(Op);
2852 EVT VT = Store->getMemoryVT();
2853
2854 if (VT == MVT::i1)
2855 return LowerSTOREi1(Op, DAG);
2856
2857 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2858 // stores and have to handle it here.
2859 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2860 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2861 VT, *Store->getMemOperand()))
2862 return expandUnalignedStore(Store, DAG);
2863
2864 // v2f16, v2bf16 and v2i16 don't need special handling.
2865 if (Isv2x16VT(VT) || VT == MVT::v4i8)
2866 return SDValue();
2867
2868 if (VT.isVector())
2869 return LowerSTOREVector(Op, DAG);
2870
2871 return SDValue();
2872}
2873
2874SDValue
2875NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2876 SDNode *N = Op.getNode();
2877 SDValue Val = N->getOperand(1);
2878 SDLoc DL(N);
2879 EVT ValVT = Val.getValueType();
2880
2881 auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
2882 if (!NumEltsAndEltVT)
2883 return SDValue();
2884 auto [NumElts, EltVT] = NumEltsAndEltVT.value();
2885
2886 MemSDNode *MemSD = cast<MemSDNode>(N);
2887 const DataLayout &TD = DAG.getDataLayout();
2888
2889 Align Alignment = MemSD->getAlign();
2890 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2891 if (Alignment < PrefAlign) {
2892 // This store is not sufficiently aligned, so bail out and let this vector
2893 // store be scalarized. Note that we may still be able to emit smaller
2894 // vector stores. For example, if we are storing a <4 x float> with an
2895 // alignment of 8, this check will fail but the legalizer will try again
2896 // with 2 x <2 x float>, which will succeed with an alignment of 8.
2897 return SDValue();
2898 }
2899
2900 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2901 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2902 // stored type to i16 and propagate the "real" type as the memory type.
2903 bool NeedExt = false;
2904 if (EltVT.getSizeInBits() < 16)
2905 NeedExt = true;
2906
2907 unsigned Opcode = 0;
2908 switch (NumElts) {
2909 default:
2910 return SDValue();
2911 case 2:
2912 Opcode = NVPTXISD::StoreV2;
2913 break;
2914 case 4:
2915 Opcode = NVPTXISD::StoreV4;
2916 break;
2917 }
2918
2919 SmallVector<SDValue, 8> Ops;
2920
2921 // First is the chain
2922 Ops.push_back(N->getOperand(0));
2923
2924 // Then the split values
2925 assert(NumElts <= ValVT.getVectorNumElements() &&
2926 "NumElts should not increase, only decrease or stay the same.");
2927 if (NumElts < ValVT.getVectorNumElements()) {
2928 // If the number of elements has decreased, getVectorLoweringShape has
2929 // upsized the element types
2930 assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
2931 EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
2932 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
2933 // stored as b32s
2934 unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
2935 for (unsigned i = 0; i < NumElts; ++i) {
2936 SmallVector<SDValue, 4> SubVectorElts;
2937 DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
2938 NumEltsPerSubVector);
2939 SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
2940 Ops.push_back(SubVector);
2941 }
2942 } else {
2943 for (unsigned i = 0; i < NumElts; ++i) {
2944 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2945 DAG.getIntPtrConstant(i, DL));
2946 if (NeedExt)
2947 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2948 Ops.push_back(ExtVal);
2949 }
2950 }
2951
2952 // Then any remaining arguments
2953 Ops.append(N->op_begin() + 2, N->op_end());
2954
2955 SDValue NewSt =
2956 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2957 MemSD->getMemoryVT(), MemSD->getMemOperand());
2958
2959 // return DCI.CombineTo(N, NewSt, true);
2960 return NewSt;
2961}
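// Example (illustrative): a 16-byte-aligned store of <8 x half> reaches this
// point with NumElts = 4 and EltVT = v2f16, so the elements are regrouped
// into four packed v2f16 (b32) operands and emitted as one NVPTXISD::StoreV4,
// which prints as st.v4.b32. An under-aligned vector store bails out above
// and is scalarized (possibly into smaller vector stores) by the legalizer.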
2962
2963// st i1 v, addr
2964// =>
2965// v1 = zxt v to i16
2966// st.u8 i16, addr
2967SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2968 SDNode *Node = Op.getNode();
2969 SDLoc dl(Node);
2970 StoreSDNode *ST = cast<StoreSDNode>(Node);
2971 SDValue Tmp1 = ST->getChain();
2972 SDValue Tmp2 = ST->getBasePtr();
2973 SDValue Tmp3 = ST->getValue();
2974 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2975 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2976 SDValue Result =
2977 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2978 ST->getAlign(), ST->getMemOperand()->getFlags());
2979 return Result;
2980}
2981
2982SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
2983 SelectionDAG &DAG) const {
2984 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
2985 // operand so that it can pass the legalization.
2986
2987 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
2988 "Custom lowering for 128-bit CopyToReg only");
2989
2990 SDNode *Node = Op.getNode();
2991 SDLoc DL(Node);
2992
2993 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
2994 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2995 DAG.getIntPtrConstant(0, DL));
2996 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2997 DAG.getIntPtrConstant(1, DL));
2998
2999 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
3000 SmallVector<EVT, 3> ResultsType(Node->values());
3001
3002 NewOps[0] = Op->getOperand(0); // Chain
3003 NewOps[1] = Op->getOperand(1); // Dst Reg
3004 NewOps[2] = Lo; // Lower 64-bit
3005 NewOps[3] = Hi; // Higher 64-bit
3006 if (Op.getNumOperands() == 4)
3007 NewOps[4] = Op->getOperand(3); // Glue if exists
3008
3009 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3010}
3011
3012unsigned NVPTXTargetLowering::getNumRegisters(
3013 LLVMContext &Context, EVT VT,
3014 std::optional<MVT> RegisterVT = std::nullopt) const {
3015 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3016 return 1;
3017 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3018}
3019
3020bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3021 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3022 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3023 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3024 Parts[0] = Val;
3025 return true;
3026 }
3027 return false;
3028}
3029
3030// This creates target external symbol for a function parameter.
3031// Name of the symbol is composed from its index and the function name.
3032// Negative index corresponds to special parameter (unsized array) used for
3033// passing variable arguments.
3034SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3035 EVT v) const {
3036 StringRef SavedStr = nvTM->getStrPool().save(
3037 getParamName(&DAG.getMachineFunction().getFunction(), idx));
3038 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3039}
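// Example (illustrative, assuming the default PTX parameter mangling):
// parameter 0 of a function `foo` is referenced through a symbol such as
// "foo_param_0", while the negative index used by LowerVASTART names the
// special unsized vararg array declared for the function. The string is
// interned in the target machine's string pool so the SDNode can keep a
// stable const char* to it.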
3040
3041 SDValue NVPTXTargetLowering::LowerFormalArguments(
3042 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3043 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3044 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3045 MachineFunction &MF = DAG.getMachineFunction();
3046 const DataLayout &DL = DAG.getDataLayout();
3047 auto PtrVT = getPointerTy(DAG.getDataLayout());
3048
3049 const Function *F = &MF.getFunction();
3050 const AttributeList &PAL = F->getAttributes();
3051 const TargetLowering *TLI = STI.getTargetLowering();
3052
3053 SDValue Root = DAG.getRoot();
3054 std::vector<SDValue> OutChains;
3055
3056 bool isABI = (STI.getSmVersion() >= 20);
3057 assert(isABI && "Non-ABI compilation is not supported");
3058 if (!isABI)
3059 return Chain;
3060
3061 std::vector<Type *> argTypes;
3062 std::vector<const Argument *> theArgs;
3063 for (const Argument &I : F->args()) {
3064 theArgs.push_back(&I);
3065 argTypes.push_back(I.getType());
3066 }
3067 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3068 // Ins.size() will be larger
3069 // * if there is an aggregate argument with multiple fields (each field
3070 // showing up separately in Ins)
3071 // * if there is a vector argument with more than typical vector-length
3072 // elements (generally if more than 4) where each vector element is
3073 // individually present in Ins.
3074 // So a different index should be used for indexing into Ins.
3075 // See similar issue in LowerCall.
3076 unsigned InsIdx = 0;
3077
3078 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3079 Type *Ty = argTypes[i];
3080
3081 if (theArgs[i]->use_empty()) {
3082 // argument is dead
3083 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3084 SmallVector<EVT, 16> vtparts;
3085
3086 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3087 if (vtparts.empty())
3088 report_fatal_error("Empty parameter types are not supported");
3089
3090 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3091 ++parti) {
3092 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3093 ++InsIdx;
3094 }
3095 if (vtparts.size() > 0)
3096 --InsIdx;
3097 continue;
3098 }
3099 if (Ty->isVectorTy()) {
3100 EVT ObjectVT = getValueType(DL, Ty);
3101 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3102 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3103 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3104 ++InsIdx;
3105 }
3106 if (NumRegs > 0)
3107 --InsIdx;
3108 continue;
3109 }
3110 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3111 continue;
3112 }
3113
3114 // In the following cases, assign a node order of "i+1"
3115 // to newly created nodes. The SDNodes for params have to
3116 // appear in the same order as their order of appearance
3117 // in the original function. "i+1" holds that order.
3118 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3119 bool aggregateIsPacked = false;
3120 if (StructType *STy = dyn_cast<StructType>(Ty))
3121 aggregateIsPacked = STy->isPacked();
3122
3123 SmallVector<EVT, 16> VTs;
3124 SmallVector<uint64_t, 16> Offsets;
3125 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3126 if (VTs.empty())
3127 report_fatal_error("Empty parameter types are not supported");
3128
3129 const Align ArgAlign = getFunctionArgumentAlignment(
3130 F, Ty, i + AttributeList::FirstArgIndex, DL);
3131 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3132
3133 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3134 int VecIdx = -1; // Index of the first element of the current vector.
3135 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3136 if (VectorInfo[parti] & PVF_FIRST) {
3137 assert(VecIdx == -1 && "Orphaned vector.");
3138 VecIdx = parti;
3139 }
3140
3141 // That's the last element of this store op.
3142 if (VectorInfo[parti] & PVF_LAST) {
3143 unsigned NumElts = parti - VecIdx + 1;
3144 EVT EltVT = VTs[parti];
3145 // i1 is loaded/stored as i8.
3146 EVT LoadVT = EltVT;
3147 if (EltVT == MVT::i1)
3148 LoadVT = MVT::i8;
3149 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3150 // getLoad needs a vector type, but it can't handle
3151 // vectors which contain v2f16 or v2bf16 elements. So we must load
3152 // using i32 here and then bitcast back.
3153 LoadVT = MVT::i32;
3154
3155 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3156 SDValue VecAddr =
3157 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3158 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3159 Value *srcValue = Constant::getNullValue(PointerType::get(
3160 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3161
3162 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3163 if (aggregateIsPacked)
3164 return Align(1);
3165 if (NumElts != 1)
3166 return std::nullopt;
3167 Align PartAlign =
3168 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3169 return commonAlignment(PartAlign, Offsets[parti]);
3170 }();
3171 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3172 MachinePointerInfo(srcValue), PartAlign,
3173 MachineMemOperand::MODereferenceable |
3174 MachineMemOperand::MOInvariant);
3175 if (P.getNode())
3176 P.getNode()->setIROrder(i + 1);
3177 for (unsigned j = 0; j < NumElts; ++j) {
3178 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3179 DAG.getIntPtrConstant(j, dl));
3180 // We've loaded i1 as an i8 and now must truncate it back to i1
3181 if (EltVT == MVT::i1)
3182 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3183 // v2f16 was loaded as an i32. Now we must bitcast it back.
3184 else if (EltVT != LoadVT)
3185 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3186
3187 // If a promoted integer type is used, truncate down to the original
3188 MVT PromotedVT;
3189 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3190 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3191 }
3192
3193 // Extend the element if necessary (e.g. an i8 is loaded
3194 // into an i16 register)
3195 if (Ins[InsIdx].VT.isInteger() &&
3196 Ins[InsIdx].VT.getFixedSizeInBits() >
3197 LoadVT.getFixedSizeInBits()) {
3198 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3199 : ISD::ZERO_EXTEND;
3200 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3201 }
3202 InVals.push_back(Elt);
3203 }
3204
3205 // Reset vector tracking state.
3206 VecIdx = -1;
3207 }
3208 ++InsIdx;
3209 }
3210 if (VTs.size() > 0)
3211 --InsIdx;
3212 continue;
3213 }
3214
3215 // Param has ByVal attribute
3216 // Return MoveParam(param symbol).
3217 // Ideally, the param symbol can be returned directly,
3218 // but when SDNode builder decides to use it in a CopyToReg(),
3219 // machine instruction fails because TargetExternalSymbol
3220 // (not lowered) is target dependent, and CopyToReg assumes
3221 // the source is lowered.
3222 EVT ObjectVT = getValueType(DL, Ty);
3223 assert(ObjectVT == Ins[InsIdx].VT &&
3224 "Ins type did not match function type");
3225 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3226 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3227 if (p.getNode())
3228 p.getNode()->setIROrder(i + 1);
3229 InVals.push_back(p);
3230 }
3231
3232 if (!OutChains.empty())
3233 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3234
3235 return Chain;
3236}
3237
3238 // Use byte-store when the param address of the return value is unaligned.
3239 // This may happen when the return value is a field of a packed structure.
3240 static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
3241 uint64_t Offset, EVT ElementType,
3242 SDValue RetVal, const SDLoc &dl) {
3243 // Bit logic only works on integer types
3244 if (adjustElementType(ElementType))
3245 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3246
3247 // Store each byte
3248 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3249 // Shift the byte to the last byte position
3250 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3251 DAG.getConstant(i * 8, dl, MVT::i32));
3252 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3253 ShiftVal};
3254 // Trunc store only the last byte by using
3255 // st.param.b8
3256 // The register type can be larger than b8.
3257 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
3258 DAG.getVTList(MVT::Other), StoreOperands,
3259 MVT::i8, MachinePointerInfo(), std::nullopt,
3260 MachineMemOperand::MOStore);
3261 }
3262 return Chain;
3263}
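// Worked example (illustrative): storing an i32 return field at byte offset 5
// of an under-aligned (packed) return value emits four st.param.b8 stores at
// offsets 5..8, each writing (RetVal >> (i * 8)) truncated to a byte, instead
// of a single st.param.b32 that would assume 4-byte alignment.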
3264
3265SDValue
3266 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3267 bool isVarArg,
3268 const SmallVectorImpl<ISD::OutputArg> &Outs,
3269 const SmallVectorImpl<SDValue> &OutVals,
3270 const SDLoc &dl, SelectionDAG &DAG) const {
3271 const MachineFunction &MF = DAG.getMachineFunction();
3272 const Function &F = MF.getFunction();
3273 Type *RetTy = MF.getFunction().getReturnType();
3274
3275 bool isABI = (STI.getSmVersion() >= 20);
3276 assert(isABI && "Non-ABI compilation is not supported");
3277 if (!isABI)
3278 return Chain;
3279
3280 const DataLayout &DL = DAG.getDataLayout();
3281 SmallVector<SDValue, 16> PromotedOutVals;
3282 SmallVector<EVT, 16> VTs;
3283 SmallVector<uint64_t, 16> Offsets;
3284 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3285 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3286
3287 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3288 SDValue PromotedOutVal = OutVals[i];
3289 MVT PromotedVT;
3290 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3291 VTs[i] = EVT(PromotedVT);
3292 }
3293 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3294 llvm::ISD::NodeType Ext =
3295 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3296 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3297 }
3298 PromotedOutVals.push_back(PromotedOutVal);
3299 }
3300
3301 auto VectorInfo = VectorizePTXValueVTs(
3302 VTs, Offsets,
3303 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3304 : Align(1));
3305
3306 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3307 // 32-bits are sign extended or zero extended, depending on whether
3308 // they are signed or unsigned types.
3309 bool ExtendIntegerRetVal =
3310 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3311
3312 SmallVector<SDValue, 6> StoreOperands;
3313 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3314 SDValue OutVal = OutVals[i];
3315 SDValue RetVal = PromotedOutVals[i];
3316
3317 if (ExtendIntegerRetVal) {
3318 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3320 dl, MVT::i32, RetVal);
3321 } else if (OutVal.getValueSizeInBits() < 16) {
3322 // Use 16-bit registers for small load-stores as it's the
3323 // smallest general purpose register size supported by NVPTX.
3324 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3325 }
3326
3327 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3328 // for a scalar store. In such cases, fall back to byte stores.
3329 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3330 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3331 Align ElementTypeAlign =
3332 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3333 Align ElementAlign =
3334 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3335 if (ElementAlign < ElementTypeAlign) {
3336 assert(StoreOperands.empty() && "Orphaned operand list.");
3337 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3338 RetVal, dl);
3339
3340 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3341 // into the graph, so just move on to the next element.
3342 continue;
3343 }
3344 }
3345
3346 // New load/store. Record chain and offset operands.
3347 if (VectorInfo[i] & PVF_FIRST) {
3348 assert(StoreOperands.empty() && "Orphaned operand list.");
3349 StoreOperands.push_back(Chain);
3350 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3351 }
3352
3353 // Record the value to return.
3354 StoreOperands.push_back(RetVal);
3355
3356 // That's the last element of this store op.
3357 if (VectorInfo[i] & PVF_LAST) {
3358 NVPTXISD::NodeType Op;
3359 unsigned NumElts = StoreOperands.size() - 2;
3360 switch (NumElts) {
3361 case 1:
3362 Op = NVPTXISD::StoreRetval;
3363 break;
3364 case 2:
3365 Op = NVPTXISD::StoreRetvalV2;
3366 break;
3367 case 4:
3368 Op = NVPTXISD::StoreRetvalV4;
3369 break;
3370 default:
3371 llvm_unreachable("Invalid vector info.");
3372 }
3373
3374 // Adjust type of load/store op if we've extended the scalar
3375 // return value.
3376 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3377 Chain = DAG.getMemIntrinsicNode(
3378 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3379 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3380 // Cleanup vector state.
3381 StoreOperands.clear();
3382 }
3383 }
3384
3385 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3386}
3387
3388 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3389 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3390 SelectionDAG &DAG) const {
3391 if (Constraint.size() > 1)
3392 return;
3393 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3394}
3395
3396// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3397// TgtMemIntrinsic
3398// because we need the information that is only available in the "Value" type
3399// of destination
3400// pointer. In particular, the address space information.
3401 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3402 IntrinsicInfo &Info, const CallInst &I,
3403 MachineFunction &MF, unsigned Intrinsic) const {
3404 switch (Intrinsic) {
3405 default:
3406 return false;
3407 case Intrinsic::nvvm_match_all_sync_i32p:
3408 case Intrinsic::nvvm_match_all_sync_i64p:
3409 Info.opc = ISD::INTRINSIC_W_CHAIN;
3410 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3411 // in order to model data exchange with other threads, but perform no real
3412 // memory accesses.
3413 Info.memVT = MVT::i1;
3414
3415 // Our result depends on both our and other thread's arguments.
3416 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3417 return true;
3418 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3419 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3420 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3421 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3422 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3423 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3424 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3425 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3426 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3427 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3428 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3429 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3430 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3431 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3432 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3433 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3434 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3435 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3436 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3437 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3438 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3439 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3440 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3441 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3443 Info.memVT = MVT::v8f16;
3444 Info.ptrVal = I.getArgOperand(0);
3445 Info.offset = 0;
3447 Info.align = Align(16);
3448 return true;
3449 }
3450 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3451 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3452 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3453 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3454 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3455 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3456 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3457 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3458 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3459 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3460 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3461 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3462 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3463 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3464 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3465 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3466 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3467 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3468 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3469 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3470 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3471 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3472 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3473 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3475 Info.memVT = MVT::v2i32;
3476 Info.ptrVal = I.getArgOperand(0);
3477 Info.offset = 0;
3479 Info.align = Align(8);
3480 return true;
3481 }
3482
3483 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3484 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3485 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3486 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3487 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3488 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3489 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3490 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3491 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3492 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3493 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3494 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3495 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3496 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3497 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3498 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3499
3500 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3501 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3502 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3503 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3504 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3505 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3506 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3507 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3508 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3509 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3510 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3511 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3512 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3513 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3514 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3515 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3516 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3517 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3519 Info.memVT = MVT::v4i32;
3520 Info.ptrVal = I.getArgOperand(0);
3521 Info.offset = 0;
3523 Info.align = Align(16);
3524 return true;
3525 }
3526
3527 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3528 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3529 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3530 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3531 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3532 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3533 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3534 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3535
3536 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3537 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3538 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3539 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3540 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3541 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3542 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3543 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3544 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3545 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3546 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3547 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3548 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3549 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3550 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3551 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3552 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3553 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3554 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3555 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3556 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3557 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3559 Info.memVT = MVT::i32;
3560 Info.ptrVal = I.getArgOperand(0);
3561 Info.offset = 0;
3563 Info.align = Align(4);
3564 return true;
3565 }
3566
3567 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3568 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3569 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3570 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3571 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3572 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3573 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3574 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3575 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3576 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3577 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3578 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3580 Info.memVT = MVT::v4f16;
3581 Info.ptrVal = I.getArgOperand(0);
3582 Info.offset = 0;
3584 Info.align = Align(16);
3585 return true;
3586 }
3587
3588 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3589 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3590 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3591 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3592 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3593 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3594 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3595 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3596 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3597 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3598 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3599 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3600 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3601 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3602 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3603 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3605 Info.memVT = MVT::v8f32;
3606 Info.ptrVal = I.getArgOperand(0);
3607 Info.offset = 0;
3609 Info.align = Align(16);
3610 return true;
3611 }
3612
3613 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3614 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3615 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3616 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3617
3618 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3619 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3620 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3621 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3622
3623 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3624 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3625 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3626 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3627 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3628 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3629 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3630 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3631 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3632 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3633 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3634 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3636 Info.memVT = MVT::v8i32;
3637 Info.ptrVal = I.getArgOperand(0);
3638 Info.offset = 0;
3640 Info.align = Align(16);
3641 return true;
3642 }
3643
3644 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3645 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3646 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3647 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3648 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3649 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3650 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3651 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3652 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3653 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
3655 Info.memVT = MVT::v2i32;
3656 Info.ptrVal = I.getArgOperand(0);
3657 Info.offset = 0;
3659 Info.align = Align(8);
3660 return true;
3661 }
3662
3663 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3664 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3665 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3666 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3667
3668 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3669 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3670 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3671 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
3673 Info.memVT = MVT::f64;
3674 Info.ptrVal = I.getArgOperand(0);
3675 Info.offset = 0;
3677 Info.align = Align(8);
3678 return true;
3679 }
3680
3681 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3682 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3683 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3684 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
3686 Info.memVT = MVT::v2f64;
3687 Info.ptrVal = I.getArgOperand(0);
3688 Info.offset = 0;
3690 Info.align = Align(16);
3691 return true;
3692 }
3693
3694 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3695 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3696 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3697 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3698 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3699 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3700 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3701 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3702 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3703 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3704 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3705 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3707 Info.memVT = MVT::v4f16;
3708 Info.ptrVal = I.getArgOperand(0);
3709 Info.offset = 0;
3711 Info.align = Align(16);
3712 return true;
3713 }
3714
3715 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3716 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3717 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3718 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3719 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3720 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3721 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3722 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3723 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3724 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3725 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3726 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3727 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3728 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3729 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3730 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
3732 Info.memVT = MVT::v8f32;
3733 Info.ptrVal = I.getArgOperand(0);
3734 Info.offset = 0;
3736 Info.align = Align(16);
3737 return true;
3738 }
3739
3740 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3741 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3742 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3743 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3744 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3745 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3746 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3747 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3748 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3749 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3750 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3751 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3753 Info.memVT = MVT::v8i32;
3754 Info.ptrVal = I.getArgOperand(0);
3755 Info.offset = 0;
3757 Info.align = Align(16);
3758 return true;
3759 }
3760
3761 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3762 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3763 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3764 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3765 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3766 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3767 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3768 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3770 Info.memVT = MVT::v2i32;
3771 Info.ptrVal = I.getArgOperand(0);
3772 Info.offset = 0;
3774 Info.align = Align(8);
3775 return true;
3776 }
3777
3778 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
3779 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
3780 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
3781 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
3783 Info.memVT = MVT::v2f64;
3784 Info.ptrVal = I.getArgOperand(0);
3785 Info.offset = 0;
3787 Info.align = Align(16);
3788 return true;
3789 }
3790
3791 case Intrinsic::nvvm_atomic_load_inc_32:
3792 case Intrinsic::nvvm_atomic_load_dec_32:
3793
3794 case Intrinsic::nvvm_atomic_add_gen_f_cta:
3795 case Intrinsic::nvvm_atomic_add_gen_f_sys:
3796 case Intrinsic::nvvm_atomic_add_gen_i_cta:
3797 case Intrinsic::nvvm_atomic_add_gen_i_sys:
3798 case Intrinsic::nvvm_atomic_and_gen_i_cta:
3799 case Intrinsic::nvvm_atomic_and_gen_i_sys:
3800 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3801 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3802 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3803 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3804 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3805 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3806 case Intrinsic::nvvm_atomic_max_gen_i_cta:
3807 case Intrinsic::nvvm_atomic_max_gen_i_sys:
3808 case Intrinsic::nvvm_atomic_min_gen_i_cta:
3809 case Intrinsic::nvvm_atomic_min_gen_i_sys:
3810 case Intrinsic::nvvm_atomic_or_gen_i_cta:
3811 case Intrinsic::nvvm_atomic_or_gen_i_sys:
3812 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3813 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3814 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3815 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3816 auto &DL = I.getDataLayout();
3818 Info.memVT = getValueType(DL, I.getType());
3819 Info.ptrVal = I.getArgOperand(0);
3820 Info.offset = 0;
3822 Info.align.reset();
3823 return true;
3824 }
3825
3826 case Intrinsic::nvvm_ldu_global_i:
3827 case Intrinsic::nvvm_ldu_global_f:
3828 case Intrinsic::nvvm_ldu_global_p: {
3829 auto &DL = I.getDataLayout();
3831 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3832 Info.memVT = getValueType(DL, I.getType());
3833 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3834 Info.memVT = getPointerTy(DL);
3835 else
3836 Info.memVT = getValueType(DL, I.getType());
3837 Info.ptrVal = I.getArgOperand(0);
3838 Info.offset = 0;
3840 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3841
3842 return true;
3843 }
3844 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3845 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3846 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3847 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3848 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3849 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3850 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3851 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3852 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3853 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3854 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3855 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3856 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3857 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3858 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3859 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3860 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3861 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3862 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3863 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3864 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3865 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3866 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3867 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3868 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3869 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3870 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3871 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3872 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3873 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3874 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3875 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3876 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3877 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3878 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3879 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3880 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3881 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3882 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3883 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3884 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3885 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3886 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3887 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3888 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3889 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3890 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3891 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3892 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3893 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3894 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3895 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3896 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3897 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3898 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3899 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3900 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3901 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3903 Info.memVT = MVT::v4f32;
3904 Info.ptrVal = nullptr;
3905 Info.offset = 0;
3907 Info.align = Align(16);
3908 return true;
3909
3910 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3911 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3912 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3913 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3914 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3915 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3916 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3917 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3918 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3919 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3920 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3921 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3922 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3923 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3924 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3925 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3926 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3927 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3928 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3929 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3930 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3931 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3932 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3933 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3934 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3935 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3936 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3937 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3938 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3939 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3940 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3941 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3942 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3943 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3944 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3945 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3946 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3947 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3948 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3949 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3950 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3951 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3952 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3953 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3954 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3955 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3956 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3957 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3958 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3959 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3960 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3961 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3962 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3963 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3964 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3965 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3966 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3967 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3968 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3969 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3970 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3971 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3972 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3973 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3974 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3975 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3976 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3977 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3978 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3979 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3980 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3981 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3982 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3983 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3984 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3985 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3986 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3987 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3988 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3989 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3990 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3991 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3992 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3993 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3994 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3995 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3996 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3997 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3998 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3999 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4000 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4001 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4002 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4003 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4004 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4005 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4006 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4007 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4008 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4009 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4010 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4011 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4012 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4013 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4014 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4015 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4016 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4017 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4018 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4019 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4020 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4021 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4022 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4023 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4024 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4025 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4027 Info.memVT = MVT::v4i32;
4028 Info.ptrVal = nullptr;
4029 Info.offset = 0;
4031 Info.align = Align(16);
4032 return true;
4033
4034 case Intrinsic::nvvm_suld_1d_i8_clamp:
4035 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4036 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4037 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4038 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4039 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4040 case Intrinsic::nvvm_suld_2d_i8_clamp:
4041 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4042 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4043 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4044 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4045 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4046 case Intrinsic::nvvm_suld_3d_i8_clamp:
4047 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4048 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4049 case Intrinsic::nvvm_suld_1d_i8_trap:
4050 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4051 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4052 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4053 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4054 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4055 case Intrinsic::nvvm_suld_2d_i8_trap:
4056 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4057 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4058 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4059 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4060 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4061 case Intrinsic::nvvm_suld_3d_i8_trap:
4062 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4063 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4064 case Intrinsic::nvvm_suld_1d_i8_zero:
4065 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4066 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4067 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4068 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4069 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4070 case Intrinsic::nvvm_suld_2d_i8_zero:
4071 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4072 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4073 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4074 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4075 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4076 case Intrinsic::nvvm_suld_3d_i8_zero:
4077 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4078 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4080 Info.memVT = MVT::i8;
4081 Info.ptrVal = nullptr;
4082 Info.offset = 0;
4084 Info.align = Align(16);
4085 return true;
4086
4087 case Intrinsic::nvvm_suld_1d_i16_clamp:
4088 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4089 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4090 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4091 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4092 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4093 case Intrinsic::nvvm_suld_2d_i16_clamp:
4094 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4095 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4096 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4097 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4098 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4099 case Intrinsic::nvvm_suld_3d_i16_clamp:
4100 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4101 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4102 case Intrinsic::nvvm_suld_1d_i16_trap:
4103 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4104 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4105 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4106 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4107 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4108 case Intrinsic::nvvm_suld_2d_i16_trap:
4109 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4110 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4111 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4112 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4113 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4114 case Intrinsic::nvvm_suld_3d_i16_trap:
4115 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4116 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4117 case Intrinsic::nvvm_suld_1d_i16_zero:
4118 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4119 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4120 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4121 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4122 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4123 case Intrinsic::nvvm_suld_2d_i16_zero:
4124 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4125 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4126 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4127 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4128 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4129 case Intrinsic::nvvm_suld_3d_i16_zero:
4130 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4131 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4133 Info.memVT = MVT::i16;
4134 Info.ptrVal = nullptr;
4135 Info.offset = 0;
4137 Info.align = Align(16);
4138 return true;
4139
4140 case Intrinsic::nvvm_suld_1d_i32_clamp:
4141 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4142 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4143 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4144 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4145 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4146 case Intrinsic::nvvm_suld_2d_i32_clamp:
4147 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4148 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4149 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4150 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4151 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4152 case Intrinsic::nvvm_suld_3d_i32_clamp:
4153 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4154 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4155 case Intrinsic::nvvm_suld_1d_i32_trap:
4156 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4157 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4158 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4159 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4160 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4161 case Intrinsic::nvvm_suld_2d_i32_trap:
4162 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4163 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4164 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4165 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4166 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4167 case Intrinsic::nvvm_suld_3d_i32_trap:
4168 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4169 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4170 case Intrinsic::nvvm_suld_1d_i32_zero:
4171 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4172 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4173 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4174 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4175 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4176 case Intrinsic::nvvm_suld_2d_i32_zero:
4177 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4178 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4179 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4180 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4181 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4182 case Intrinsic::nvvm_suld_3d_i32_zero:
4183 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4184 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4186 Info.memVT = MVT::i32;
4187 Info.ptrVal = nullptr;
4188 Info.offset = 0;
4190 Info.align = Align(16);
4191 return true;
4192
4193 case Intrinsic::nvvm_suld_1d_i64_clamp:
4194 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4195 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4196 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4197 case Intrinsic::nvvm_suld_2d_i64_clamp:
4198 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4199 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4200 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4201 case Intrinsic::nvvm_suld_3d_i64_clamp:
4202 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4203 case Intrinsic::nvvm_suld_1d_i64_trap:
4204 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4205 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4206 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4207 case Intrinsic::nvvm_suld_2d_i64_trap:
4208 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4209 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4210 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4211 case Intrinsic::nvvm_suld_3d_i64_trap:
4212 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4213 case Intrinsic::nvvm_suld_1d_i64_zero:
4214 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4215 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4216 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4217 case Intrinsic::nvvm_suld_2d_i64_zero:
4218 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4219 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4220 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4221 case Intrinsic::nvvm_suld_3d_i64_zero:
4222 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4224 Info.memVT = MVT::i64;
4225 Info.ptrVal = nullptr;
4226 Info.offset = 0;
4228 Info.align = Align(16);
4229 return true;
4230 }
4231 return false;
4232}
4233
 4234/// getFunctionParamOptimizedAlign - since function arguments are passed via
 4235/// .param space, we may want to increase their alignment in a way that
 4236/// ensures that we can effectively vectorize their loads & stores. We can
 4237/// increase alignment only if the function has internal or private linkage,
 4238/// as for other linkage types callers may already rely on the default
 4239/// alignment. To allow 128-bit vectorized loads/stores, this function ensures
 4240/// that the returned alignment is 16 or greater.
4242 const Function *F, Type *ArgTy, const DataLayout &DL) const {
4243 // Capping the alignment to 128 bytes as that is the maximum alignment
4244 // supported by PTX.
4245 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
4246
4247 // If a function has linkage different from internal or private, we
4248 // must use default ABI alignment as external users rely on it. Same
4249 // for a function that may be called from a function pointer.
4250 if (!F || !F->hasLocalLinkage() ||
4251 F->hasAddressTaken(/*Users=*/nullptr,
4252 /*IgnoreCallbackUses=*/false,
4253 /*IgnoreAssumeLikeCalls=*/true,
4254 /*IgnoreLLVMUsed=*/true))
4255 return ABITypeAlign;
4256
4257 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4258 return std::max(Align(16), ABITypeAlign);
4259}
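// Illustrative example (hypothetical signature, not from this file): for an
// internal-linkage device function taking a byval aggregate such as
//   struct S { float f[4]; };
// the ABI alignment of the corresponding .param is only 4, but raising it to
// 16 here lets later passes vectorize the parameter loads, e.g. as a single
// ld.param.v4.f32 instead of four scalar loads.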
4260
4261/// Helper for computing alignment of a device function byval parameter.
4263 const Function *F, Type *ArgTy, Align InitialAlign,
4264 const DataLayout &DL) const {
4265 Align ArgAlign = InitialAlign;
4266 // Try to increase alignment to enhance vectorization options.
4267 if (F)
4268 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4269
 4270 // Old ptxas versions have a bug. When PTX code takes the address of a
 4271 // byval parameter with alignment < 4, ptxas generates code to spill the
 4272 // argument into memory. Alas, on sm_50+ ptxas generates SASS code that
 4273 // fails with a misaligned access. To work around the problem, make sure
 4274 // that we align byval parameters by at least 4. This bug appears to be
 4275 // fixed at least starting from ptxas > 9.0.
4277 // TODO: remove this after verifying the bug is not reproduced
4278 // on non-deprecated ptxas versions.
 4279 if (ForceMinByValParamAlign)
 4280 ArgAlign = std::max(ArgAlign, Align(4));
4281
4282 return ArgAlign;
4283}
4284
 4285// Helper for getting a function parameter name. The name is composed from
 4286// the parameter's index and the function name. A negative index corresponds
 4287// to the special parameter (an unsized array) used for passing variable
 4288// arguments.
4289 int Idx) const {
4290 std::string ParamName;
4291 raw_string_ostream ParamStr(ParamName);
4292
4293 ParamStr << getTargetMachine().getSymbol(F)->getName();
4294 if (Idx < 0)
4295 ParamStr << "_vararg";
4296 else
4297 ParamStr << "_param_" << Idx;
4298
4299 return ParamName;
4300}
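// For illustration (hypothetical symbol name): a function whose symbol is
// "foo" would get parameters named "foo_param_0", "foo_param_1", ..., plus
// "foo_vararg" for the unsized array that carries variadic arguments.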
4301
4302/// isLegalAddressingMode - Return true if the addressing mode represented
4303/// by AM is legal for this target, for a load/store of the specified type.
4304/// Used to guide target specific optimizations, like loop strength reduction
4305/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4306/// (CodeGenPrepare.cpp)
4308 const AddrMode &AM, Type *Ty,
4309 unsigned AS, Instruction *I) const {
4310 // AddrMode - This represents an addressing mode of:
4311 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4312 //
4313 // The legal address modes are
4314 // - [avar]
4315 // - [areg]
4316 // - [areg+immoff]
4317 // - [immAddr]
4318
4319 // immoff must fit in a signed 32-bit int
4320 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
4321 return false;
4322
4323 if (AM.BaseGV)
4324 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4325
4326 switch (AM.Scale) {
4327 case 0: // "r", "r+i" or "i" is allowed
4328 break;
4329 case 1:
4330 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4331 return false;
4332 // Otherwise we have r+i.
4333 break;
4334 default:
4335 // No scale > 1 is allowed
4336 return false;
4337 }
4338 return true;
4339}
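// A few concrete cases under the rules above (PTX-style operands are shown
// for illustration only):
//   [globalvar]     base global only                       -> legal
//   [%r1]           base register only                     -> legal
//   [%r1+16]        base register + signed 32-bit offset   -> legal
//   [globalvar+16]  base global combined with an offset    -> rejected
//   [%r1+%r2]       two registers (Scale == 1 with a base
//                   register)                              -> rejected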
4340
4341//===----------------------------------------------------------------------===//
4342// NVPTX Inline Assembly Support
4343//===----------------------------------------------------------------------===//
4344
4345/// getConstraintType - Given a constraint letter, return the type of
4346/// constraint it is for this target.
4349 if (Constraint.size() == 1) {
4350 switch (Constraint[0]) {
4351 default:
4352 break;
4353 case 'b':
4354 case 'r':
4355 case 'h':
4356 case 'c':
4357 case 'l':
4358 case 'f':
4359 case 'd':
4360 case 'q':
4361 case '0':
4362 case 'N':
4363 return C_RegisterClass;
4364 }
4365 }
4366 return TargetLowering::getConstraintType(Constraint);
4367}
4368
4369std::pair<unsigned, const TargetRegisterClass *>
4371 StringRef Constraint,
4372 MVT VT) const {
4373 if (Constraint.size() == 1) {
4374 switch (Constraint[0]) {
4375 case 'b':
4376 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4377 case 'c':
4378 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4379 case 'h':
4380 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4381 case 'r':
4382 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4383 case 'l':
4384 case 'N':
4385 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4386 case 'q': {
4387 if (STI.getSmVersion() < 70)
4388 report_fatal_error("Inline asm with 128 bit operands is only "
4389 "supported for sm_70 and higher!");
4390 return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
4391 }
4392 case 'f':
4393 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4394 case 'd':
4395 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4396 }
4397 }
4398 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4399}
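// Illustrative CUDA-style use of these constraints (the asm text itself is
// only an example):
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
// Here "r" selects Int32Regs; "h", "l", "f", "d" and "q" would select 16-bit
// integer, 64-bit integer, f32, f64 and 128-bit integer registers
// respectively, with "q" requiring sm_70+ as enforced above.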
4400
4401//===----------------------------------------------------------------------===//
4402// NVPTX DAG Combining
4403//===----------------------------------------------------------------------===//
4404
4406 CodeGenOptLevel OptLevel) const {
4407 // Always honor command-line argument
4408 if (FMAContractLevelOpt.getNumOccurrences() > 0)
4409 return FMAContractLevelOpt > 0;
4410
4411 // Do not contract if we're not optimizing the code.
4412 if (OptLevel == CodeGenOptLevel::None)
4413 return false;
4414
4415 // Honor TargetOptions flags that explicitly say fusion is okay.
 4416 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
 4417 return true;
4418
4419 return allowUnsafeFPMath(MF);
4420}
4421
4423 // Honor TargetOptions flags that explicitly say unsafe math is okay.
 4424 if (MF.getTarget().Options.UnsafeFPMath)
 4425 return true;
4426
4427 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4428 const Function &F = MF.getFunction();
4429 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
4430}
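// For reference, the function attribute consulted above looks like this in
// IR (sketch only):
//   define float @f(float %a, float %b) #0 { ... }
//   attributes #0 = { "unsafe-fp-math"="true" }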
4431
4432static bool isConstZero(const SDValue &Operand) {
4433 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
4434 return Const && Const->getZExtValue() == 0;
4435}
4436
4437/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4438/// operands N0 and N1. This is a helper for PerformADDCombine that is
4439/// called with the default operands, and if that fails, with commuted
4440/// operands.
4441static SDValue
4444 EVT VT = N0.getValueType();
4445
4446 // Since integer multiply-add costs the same as integer multiply
4447 // but is more costly than integer add, do the fusion only when
4448 // the mul is only used in the add.
4449 // TODO: this may not be true for later architectures, consider relaxing this
4450 if (!N0.getNode()->hasOneUse())
4451 return SDValue();
4452
4453 // fold (add (select cond, 0, (mul a, b)), c)
4454 // -> (select cond, c, (add (mul a, b), c))
4455 //
4456 if (N0.getOpcode() == ISD::SELECT) {
4457 unsigned ZeroOpNum;
4458 if (isConstZero(N0->getOperand(1)))
4459 ZeroOpNum = 1;
4460 else if (isConstZero(N0->getOperand(2)))
4461 ZeroOpNum = 2;
4462 else
4463 return SDValue();
4464
4465 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
4466 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
4467 return SDValue();
4468
4469 SDLoc DL(N);
4470 SDValue Mul =
4471 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
4472 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
4473 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
4474 ((ZeroOpNum == 1) ? N1 : MAD),
4475 ((ZeroOpNum == 1) ? MAD : N1));
4476 }
4477
4478 return SDValue();
4479}
4480
4481static SDValue
4484 CodeGenOptLevel OptLevel) {
4485 EVT VT = N0.getValueType();
4486 if (N0.getOpcode() == ISD::FMUL) {
4487 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4488 &DCI.DAG.getTargetLoweringInfo());
4489 if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
4490 return SDValue();
4491
4492 // For floating point:
 4493 // Do the fusion only when the mul has fewer than 5 uses and all
 4494 // of them are adds.
 4495 // The heuristic is that if a use is not an add, then that use
 4496 // cannot be fused into an fma, so the mul is still needed anyway.
 4497 // If there are more than 4 uses, even if they are all adds, fusing
 4498 // them will increase register pressure.
4499 //
4500 int numUses = 0;
4501 int nonAddCount = 0;
4502 for (const SDNode *User : N0.getNode()->users()) {
4503 numUses++;
4504 if (User->getOpcode() != ISD::FADD)
4505 ++nonAddCount;
4506 if (numUses >= 5)
4507 return SDValue();
4508 }
4509 if (nonAddCount) {
4510 int orderNo = N->getIROrder();
4511 int orderNo2 = N0.getNode()->getIROrder();
 4512 // A simple heuristic for estimating potential register pressure: the
 4513 // difference in IR order approximates the distance between def and use,
 4514 // and the longer that distance, the more likely the value causes register
 4515 // pressure.
4516 if (orderNo - orderNo2 < 500)
4517 return SDValue();
4518
4519 // Now, check if at least one of the FMUL's operands is live beyond the
4520 // node N, which guarantees that the FMA will not increase register
4521 // pressure at node N.
4522 bool opIsLive = false;
4523 const SDNode *left = N0.getOperand(0).getNode();
4524 const SDNode *right = N0.getOperand(1).getNode();
4525
4526 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4527 opIsLive = true;
4528
4529 if (!opIsLive)
4530 for (const SDNode *User : left->users()) {
4531 int orderNo3 = User->getIROrder();
4532 if (orderNo3 > orderNo) {
4533 opIsLive = true;
4534 break;
4535 }
4536 }
4537
4538 if (!opIsLive)
4539 for (const SDNode *User : right->users()) {
4540 int orderNo3 = User->getIROrder();
4541 if (orderNo3 > orderNo) {
4542 opIsLive = true;
4543 break;
4544 }
4545 }
4546
4547 if (!opIsLive)
4548 return SDValue();
4549 }
4550
4551 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
4552 N0.getOperand(1), N1);
4553 }
4554
4555 return SDValue();
4556}
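// Net effect of the combine above, when the heuristics allow it:
//   (fadd (fmul a, b), c)  ->  (fma a, b, c)
// which is later selected to an fma.rn instruction of the appropriate type.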
4557
4558static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
4559 std::size_t Back) {
4560 if (all_of(N->ops().drop_front(Front).drop_back(Back),
4561 [](const SDUse &U) { return U.get()->isUndef(); }))
4562 // Operand 0 is the previous value in the chain. Cannot return EntryToken
4563 // as the previous value will become unused and eliminated later.
4564 return N->getOperand(0);
4565
4566 return SDValue();
4567}
4568
4570 // Operands from the 3rd to the 2nd last one are the values to be stored.
4571 // {Chain, ArgID, Offset, Val, Glue}
4572 return PerformStoreCombineHelper(N, 3, 1);
4573}
4574
4576 // Operands from the 2nd to the last one are the values to be stored
4577 return PerformStoreCombineHelper(N, 2, 0);
4578}
4579
4580/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4581///
4584 CodeGenOptLevel OptLevel) {
4585 if (OptLevel == CodeGenOptLevel::None)
4586 return SDValue();
4587
4588 SDValue N0 = N->getOperand(0);
4589 SDValue N1 = N->getOperand(1);
4590
4591 // Skip non-integer, non-scalar case
4592 EVT VT = N0.getValueType();
4593 if (VT.isVector() || VT != MVT::i32)
4594 return SDValue();
4595
4596 // First try with the default operand order.
4597 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
4598 return Result;
4599
4600 // If that didn't work, try again with the operands commuted.
4601 return PerformADDCombineWithOperands(N, N1, N0, DCI);
4602}
4603
4604/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
4605///
4608 CodeGenOptLevel OptLevel) {
4609 SDValue N0 = N->getOperand(0);
4610 SDValue N1 = N->getOperand(1);
4611
4612 EVT VT = N0.getValueType();
4613 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
4614 return SDValue();
4615
4616 // First try with the default operand order.
4617 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
4618 return Result;
4619
4620 // If that didn't work, try again with the operands commuted.
4621 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
4622}
4623
4626 // The type legalizer turns a vector load of i8 values into a zextload to i16
4627 // registers, optionally ANY_EXTENDs it (if target type is integer),
4628 // and ANDs off the high 8 bits. Since we turn this load into a
4629 // target-specific DAG node, the DAG combiner fails to eliminate these AND
4630 // nodes. Do that here.
4631 SDValue Val = N->getOperand(0);
4632 SDValue Mask = N->getOperand(1);
4633
4634 if (isa<ConstantSDNode>(Val)) {
4635 std::swap(Val, Mask);
4636 }
4637
4638 SDValue AExt;
4639
 4640 // Convert (BFE -> truncate i16 -> and 255) to just (BFE -> truncate i16),
 4641 // as the value already has all the bits in the right places.
4643 if (Val.getOpcode() == ISD::TRUNCATE) {
4644 SDValue BFE = Val.getOperand(0);
4645 if (BFE.getOpcode() != NVPTXISD::BFE)
4646 return SDValue();
4647
4648 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
4649 if (!BFEBits)
4650 return SDValue();
4651 uint64_t BFEBitsVal = BFEBits->getZExtValue();
4652
4653 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4654 if (!MaskCnst) {
4655 // Not an AND with a constant
4656 return SDValue();
4657 }
4658 uint64_t MaskVal = MaskCnst->getZExtValue();
4659
4660 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
4661 return SDValue();
4662 // If we get here, the AND is unnecessary. Just replace it with the trunc
4663 DCI.CombineTo(N, Val, false);
4664 }
4665 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4666 if (Val.getOpcode() == ISD::ANY_EXTEND) {
4667 AExt = Val;
4668 Val = Val->getOperand(0);
4669 }
4670
4671 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4672 Val->getOpcode() == NVPTXISD::LoadV4) {
4673 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4674 if (!MaskCnst) {
4675 // Not an AND with a constant
4676 return SDValue();
4677 }
4678
4679 uint64_t MaskVal = MaskCnst->getZExtValue();
4680 if (MaskVal != 0xff) {
4681 // Not an AND that chops off top 8 bits
4682 return SDValue();
4683 }
4684
4685 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4686 if (!Mem) {
4687 // Not a MemSDNode?!?
4688 return SDValue();
4689 }
4690
4691 EVT MemVT = Mem->getMemoryVT();
4692 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4693 // We only handle the i8 case
4694 return SDValue();
4695 }
4696
4697 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
4698 if (ExtType == ISD::SEXTLOAD) {
4699 // If for some reason the load is a sextload, the and is needed to zero
4700 // out the high 8 bits
4701 return SDValue();
4702 }
4703
4704 bool AddTo = false;
4705 if (AExt.getNode() != nullptr) {
4706 // Re-insert the ext as a zext.
4707 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4708 AExt.getValueType(), Val);
4709 AddTo = true;
4710 }
4711
4712 // If we get here, the AND is unnecessary. Just replace it with the load
4713 DCI.CombineTo(N, Val, AddTo);
4714 }
4715
4716 return SDValue();
4717}
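// Example of the redundancy this removes (sketch): after legalizing a
// <4 x i8> load, the DAG can contain
//   (and (any_extend (NVPTXISD::LoadV4 ..., zextload)), 255)
// Since the zextload already cleared the high bits, the AND is dropped and
// the any_extend, if present, is re-inserted as a zero_extend.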
4718
4721 CodeGenOptLevel OptLevel) {
4722 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4723
4724 // Don't do anything at less than -O2.
4725 if (OptLevel < CodeGenOptLevel::Default)
4726 return SDValue();
4727
4728 SelectionDAG &DAG = DCI.DAG;
4729 SDLoc DL(N);
4730 EVT VT = N->getValueType(0);
4731 bool IsSigned = N->getOpcode() == ISD::SREM;
4732 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4733
4734 const SDValue &Num = N->getOperand(0);
4735 const SDValue &Den = N->getOperand(1);
4736
4737 for (const SDNode *U : Num->users()) {
4738 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4739 U->getOperand(1) == Den) {
4740 // Num % Den -> Num - (Num / Den) * Den
4741 return DAG.getNode(ISD::SUB, DL, VT, Num,
4742 DAG.getNode(ISD::MUL, DL, VT,
4743 DAG.getNode(DivOpc, DL, VT, Num, Den),
4744 Den));
4745 }
4746 }
4747 return SDValue();
4748}
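// Example of the rewrite above, in IR terms: given both
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d
// the urem is replaced with
//   %t = mul i32 %q, %d
//   %r = sub i32 %n, %t
// so only one divide is emitted and the remainder is recovered with a
// multiply and a subtract.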
4749
4750enum OperandSignedness {
4751 Signed = 0,
4752 Unsigned,
4753 Unknown
4754};
4755
4756/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4757/// that can be demoted to \p OptSize bits without loss of information. The
4758/// signedness of the operand, if determinable, is placed in \p S.
4760 unsigned OptSize,
4761 OperandSignedness &S) {
4762 S = Unknown;
4763
4764 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4765 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4766 EVT OrigVT = Op.getOperand(0).getValueType();
4767 if (OrigVT.getFixedSizeInBits() <= OptSize) {
4768 S = Signed;
4769 return true;
4770 }
4771 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4772 EVT OrigVT = Op.getOperand(0).getValueType();
4773 if (OrigVT.getFixedSizeInBits() <= OptSize) {
4774 S = Unsigned;
4775 return true;
4776 }
4777 }
4778
4779 return false;
4780}
4781
4782/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4783/// be demoted to \p OptSize bits without loss of information. If the operands
4784/// contain a constant, it should appear as the RHS operand. The signedness of
4785/// the operands is placed in \p IsSigned.
4787 unsigned OptSize,
4788 bool &IsSigned) {
4789 OperandSignedness LHSSign;
4790
4791 // The LHS operand must be a demotable op
4792 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4793 return false;
4794
4795 // We should have been able to determine the signedness from the LHS
4796 if (LHSSign == Unknown)
4797 return false;
4798
4799 IsSigned = (LHSSign == Signed);
4800
4801 // The RHS can be a demotable op or a constant
4802 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4803 const APInt &Val = CI->getAPIntValue();
4804 if (LHSSign == Unsigned) {
4805 return Val.isIntN(OptSize);
4806 } else {
4807 return Val.isSignedIntN(OptSize);
4808 }
4809 } else {
4810 OperandSignedness RHSSign;
4811 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4812 return false;
4813
4814 return LHSSign == RHSSign;
4815 }
4816}
4817
4818/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4819/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4820/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4821/// amount.
4824 EVT MulType = N->getValueType(0);
4825 if (MulType != MVT::i32 && MulType != MVT::i64) {
4826 return SDValue();
4827 }
4828
4829 SDLoc DL(N);
4830 unsigned OptSize = MulType.getSizeInBits() >> 1;
4831 SDValue LHS = N->getOperand(0);
4832 SDValue RHS = N->getOperand(1);
4833
4834 // Canonicalize the multiply so the constant (if any) is on the right
4835 if (N->getOpcode() == ISD::MUL) {
4836 if (isa<ConstantSDNode>(LHS)) {
4837 std::swap(LHS, RHS);
4838 }
4839 }
4840
4841 // If we have a SHL, determine the actual multiply amount
4842 if (N->getOpcode() == ISD::SHL) {
4843 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4844 if (!ShlRHS) {
4845 return SDValue();
4846 }
4847
4848 APInt ShiftAmt = ShlRHS->getAPIntValue();
4849 unsigned BitWidth = MulType.getSizeInBits();
4850 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4851 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4852 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4853 } else {
4854 return SDValue();
4855 }
4856 }
4857
4858 bool Signed;
4859 // Verify that our operands are demotable
4860 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4861 return SDValue();
4862 }
4863
4864 EVT DemotedVT;
4865 if (MulType == MVT::i32) {
4866 DemotedVT = MVT::i16;
4867 } else {
4868 DemotedVT = MVT::i32;
4869 }
4870
4871 // Truncate the operands to the correct size. Note that these are just for
4872 // type consistency and will (likely) be eliminated in later phases.
4873 SDValue TruncLHS =
4874 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4875 SDValue TruncRHS =
4876 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4877
4878 unsigned Opc;
 4879 if (Signed) {
 4880 Opc = NVPTXISD::MUL_WIDE_SIGNED;
 4881 } else {
 4882 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
 4883 }
4884
4885 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4886}
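// Example of the transform (assuming the usual mul.wide selection patterns):
//   (mul i32 (sext i16 %a), (sext i16 %b))
// has both operands demotable to 16 bits, so it is rewritten as a signed
// mul-wide of the truncated operands and ultimately selects to mul.wide.s16.
// A (shl i32 %x, C) with a constant C is handled the same way by first
// treating it as a multiply by (1 << C).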
4887
4888static bool isConstOne(const SDValue &Operand) {
4889 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
4890 return Const && Const->getZExtValue() == 1;
4891}
4892
4894 if (Add->getOpcode() != ISD::ADD)
4895 return SDValue();
4896
4897 if (isConstOne(Add->getOperand(0)))
4898 return Add->getOperand(1);
4899
4900 if (isConstOne(Add->getOperand(1)))
4901 return Add->getOperand(0);
4902
4903 return SDValue();
4904}
4905
4908
4910 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
4911 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
4912 }
4913
4914 return SDValue();
4915}
4916
4918 SDLoc DL,
4920 if (Select->getOpcode() != ISD::SELECT)
4921 return SDValue();
4922
4923 SDValue Cond = Select->getOperand(0);
4924
4925 unsigned ConstOpNo;
4926 if (isConstOne(Select->getOperand(1)))
4927 ConstOpNo = 1;
4928 else if (isConstOne(Select->getOperand(2)))
4929 ConstOpNo = 2;
4930 else
4931 return SDValue();
4932
4933 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
4934
4935 // Do not combine if the resulting sequence is not obviously profitable.
4937 return SDValue();
4938
4939 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
4940
4941 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
4942 (ConstOpNo == 1) ? X : NewMul,
4943 (ConstOpNo == 1) ? NewMul : X);
4944}
4945
4946static SDValue
4949
4950 EVT VT = N0.getValueType();
4951 if (VT.isVector())
4952 return SDValue();
4953
4954 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
4955 return SDValue();
4956
4957 SDLoc DL(N);
4958
4959 // (mul x, (add y, 1)) -> (add (mul x, y), x)
4960 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
4961 return Res;
4962 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
4963 return Res;
4964
4965 // (mul x, (select y, 1)) -> (select (mul x, y), x)
4966 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
4967 return Res;
4968 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
4969 return Res;
4970
4971 return SDValue();
4972}
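// Net effect of the two combines above, sketched: (mul x, (add y, 1)) becomes
// (add (mul x, y), x), and (mul x, (select c, 1, y)) becomes
// (select c, x, (mul x, y)); both shapes let instruction selection form a
// single mad, and the select case only fires when the profitability check in
// combineMulSelectConstOne is satisfied.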
4973
4974/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4975static SDValue PerformMULCombine(SDNode *N,
4976 TargetLowering::DAGCombinerInfo &DCI,
4977 CodeGenOptLevel OptLevel) {
4978 if (OptLevel == CodeGenOptLevel::None)
4979 return SDValue();
4980
4981 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4982 return Ret;
4983
4984 SDValue N0 = N->getOperand(0);
4985 SDValue N1 = N->getOperand(1);
4986 return PerformMULCombineWithOperands(N, N0, N1, DCI);
4987}
4988
4989/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4990static SDValue PerformSHLCombine(SDNode *N,
4991 TargetLowering::DAGCombinerInfo &DCI,
4992 CodeGenOptLevel OptLevel) {
4993 if (OptLevel > CodeGenOptLevel::None) {
4994 // Try mul.wide combining at OptLevel > 0
4995 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4996 return Ret;
4997 }
4998
4999 return SDValue();
5000}
5001
5002static SDValue PerformSETCCCombine(SDNode *N,
5003 TargetLowering::DAGCombinerInfo &DCI,
5004 unsigned int SmVersion) {
5005 EVT CCType = N->getValueType(0);
5006 SDValue A = N->getOperand(0);
5007 SDValue B = N->getOperand(1);
5008
5009 EVT AType = A.getValueType();
5010 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5011 return SDValue();
5012
5013 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5014 return SDValue();
5015
5016 SDLoc DL(N);
5017 // setp.f16x2 returns two scalar predicates, which we need to
5018 // convert back to v2i1. The returned result will be scalarized by
5019 // the legalizer, but the comparison will remain a single vector
5020 // instruction.
5021 SDValue CCNode = DCI.DAG.getNode(
5022 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5023 : NVPTXISD::SETP_BF16X2,
5024 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5025 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5026 CCNode.getValue(1));
5027}
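// In effect, a v2i1 = setcc of v2f16 (or, on sm_90+, v2bf16) operands is kept
// as one packed SETP_F16X2 / SETP_BF16X2 comparison yielding two i1 results,
// instead of being scalarized into two separate compares by the legalizer.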
5028
5029static SDValue PerformEXTRACTCombine(SDNode *N,
5030 TargetLowering::DAGCombinerInfo &DCI) {
5031 SDValue Vector = N->getOperand(0);
5032 if (Vector->getOpcode() == ISD::FREEZE)
5033 Vector = Vector->getOperand(0);
5034 SDLoc DL(N);
5035 EVT VectorVT = Vector.getValueType();
5036 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5037 IsPTXVectorType(VectorVT.getSimpleVT()))
5038 return SDValue(); // Native vector loads already combine nicely w/
5039 // extract_vector_elt.
5040 // Don't mess with singletons or v2*16, v4i8 and v8i8 types; we already
5041 // handle them OK.
5042 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5043 VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5044 return SDValue();
5045
5046 // Don't mess with undef values as sra may be simplified to 0, not undef.
5047 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5048 return SDValue();
5049
5050 uint64_t VectorBits = VectorVT.getSizeInBits();
5051 // We only handle the types we can extract in-register.
5052 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5053 return SDValue();
5054
5055 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5056 // Index == 0 is handled by generic DAG combiner.
5057 if (!Index || Index->getZExtValue() == 0)
5058 return SDValue();
5059
5060 MVT IVT = MVT::getIntegerVT(VectorBits);
5061 EVT EltVT = VectorVT.getVectorElementType();
5062 EVT EltIVT = EltVT.changeTypeToInteger();
5063 uint64_t EltBits = EltVT.getScalarSizeInBits();
5064
5065 SDValue Result = DCI.DAG.getNode(
5066 ISD::TRUNCATE, DL, EltIVT,
5067 DCI.DAG.getNode(
5068 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5069 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5070
5071 // If element has non-integer type, bitcast it back to the expected type.
5072 if (EltVT != EltIVT)
5073 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5074 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5075 if (EltVT != N->getValueType(0))
5076 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5077
5078 return Result;
5079}
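// Worked example: extracting element 1 from a v2f16 vector becomes, roughly,
// (bitcast f16 (trunc i16 (sra i32 (bitcast i32 V), 16))), i.e. the element is
// shifted out of the 32-bit register directly rather than going through a
// per-element extraction.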
5080
5081static SDValue PerformVSELECTCombine(SDNode *N,
5082 TargetLowering::DAGCombinerInfo &DCI) {
5083 SDValue VA = N->getOperand(1);
5084 EVT VectorVT = VA.getValueType();
5085 if (VectorVT != MVT::v4i8)
5086 return SDValue();
5087
5088 // We need to split the vselect into individual per-element operations. Because
5089 // we use BFE/BFI instructions for byte extraction/insertion, we end up with
5090 // 32-bit values anyway, so we may as well do the comparison as i32 to avoid
5091 // the conversions to/from i16 normally used for i8 values.
5092 SmallVector<SDValue, 4> E;
5093 SDLoc DL(N);
5094 SDValue VCond = N->getOperand(0);
5095 SDValue VB = N->getOperand(2);
5096 for (int I = 0; I < 4; ++I) {
5097 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5098 DCI.DAG.getConstant(I, DL, MVT::i32));
5099 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5100 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5101 DCI.DAG.getConstant(I, DL, MVT::i32)),
5102 DL, MVT::i32);
5103 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5104 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5105 DCI.DAG.getConstant(I, DL, MVT::i32)),
5106 DL, MVT::i32);
5107 E.push_back(DCI.DAG.getAnyExtOrTrunc(
5108 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5109 }
5110 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5111}
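// For v4i8 this yields four independent i32 selects on the any-extended bytes,
// re-packed with BUILD_VECTOR, matching the BFE/BFI-based byte handling
// described in the comment above.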
5112
5113static SDValue
5114PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5115 auto VT = N->getValueType(0);
5116 if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT))
5117 return SDValue();
5118
5119 auto Op0 = N->getOperand(0);
5120 auto Op1 = N->getOperand(1);
5121
5122 // Start out by assuming we want to take the lower 2 bytes of each i32
5123 // operand.
5124 uint64_t Op0Bytes = 0x10;
5125 uint64_t Op1Bytes = 0x54;
5126
5127 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
5128 {&Op1, &Op1Bytes}};
5129
5130 // Check that each operand is an i16, truncated from an i32 operand. We'll
5131 // select individual bytes from those original operands. Optionally, fold in a
5132 // shift right of that original operand.
5133 for (auto &[Op, OpBytes] : OpData) {
5134 // Eat up any bitcast
5135 if (Op->getOpcode() == ISD::BITCAST)
5136 *Op = Op->getOperand(0);
5137
5138 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
5139 Op->getOperand(0).getValueType() == MVT::i32))
5140 return SDValue();
5141
5142 // If the truncate has multiple uses, this optimization can increase
5143 // register pressure
5144 if (!Op->hasOneUse())
5145 return SDValue();
5146
5147 *Op = Op->getOperand(0);
5148
5149 // Optionally, fold in a shift-right of the original operand and let permute
5150 // pick the two higher bytes of the original value directly.
5151 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
5152 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
5153 // Shift the PRMT byte selector to pick upper bytes from each respective
5154 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
5155 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
5156 "PRMT selector values out of range");
5157 *OpBytes += 0x22;
5158 *Op = Op->getOperand(0);
5159 }
5160 }
5161 }
5162
5163 SDLoc DL(N);
5164 auto &DAG = DCI.DAG;
5165
5166 auto PRMT = DAG.getNode(
5167 NVPTXISD::PRMT, DL, MVT::v4i8,
5168 {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
5169 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
5170 return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
5171}
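// Byte-selector arithmetic, for reference: PRMT addresses the concatenated
// {Op1:Op0} pair as bytes 7..0, so the default selector
// (Op1Bytes << 8) | Op0Bytes == 0x5410 picks bytes 1:0 of Op0 and bytes 5:4
// (the low half of Op1). Folding a srl-by-16 adds 0x22, turning 0x10 into 0x32
// and 0x54 into 0x76, which selects the upper halves instead.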
5172
5173SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5174 DAGCombinerInfo &DCI) const {
5175 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5176 switch (N->getOpcode()) {
5177 default: break;
5178 case ISD::ADD:
5179 return PerformADDCombine(N, DCI, OptLevel);
5180 case ISD::FADD:
5181 return PerformFADDCombine(N, DCI, OptLevel);
5182 case ISD::MUL:
5183 return PerformMULCombine(N, DCI, OptLevel);
5184 case ISD::SHL:
5185 return PerformSHLCombine(N, DCI, OptLevel);
5186 case ISD::AND:
5187 return PerformANDCombine(N, DCI);
5188 case ISD::UREM:
5189 case ISD::SREM:
5190 return PerformREMCombine(N, DCI, OptLevel);
5191 case ISD::SETCC:
5192 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5193 case NVPTXISD::StoreRetval:
5194 case NVPTXISD::StoreRetvalV2:
5195 case NVPTXISD::StoreRetvalV4:
5196 return PerformStoreRetvalCombine(N);
5197 case NVPTXISD::StoreParam:
5198 case NVPTXISD::StoreParamV2:
5199 case NVPTXISD::StoreParamV4:
5200 return PerformStoreParamCombine(N);
5201 case ISD::EXTRACT_VECTOR_ELT:
5202 return PerformEXTRACTCombine(N, DCI);
5203 case ISD::VSELECT:
5204 return PerformVSELECTCombine(N, DCI);
5205 case ISD::BUILD_VECTOR:
5206 return PerformBUILD_VECTORCombine(N, DCI);
5207 }
5208 return SDValue();
5209}
5210
5211static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
5212 SmallVectorImpl<SDValue> &Results) {
5213 // Handle bitcasting to v2i8 without hitting the default promotion
5214 // strategy which goes through stack memory.
5215 SDValue Op(Node, 0);
5216 EVT ToVT = Op->getValueType(0);
5217 if (ToVT != MVT::v2i8) {
5218 return;
5219 }
5220
5221 // Bitcast to i16 and unpack elements into a vector
5222 SDLoc DL(Node);
5223 SDValue AsInt = MaybeBitcast(DAG, DL, MVT::i16, Op->getOperand(0));
5224 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
5225 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
5226 SDValue Vec1 =
5227 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5228 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
5229 Results.push_back(
5230 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
5231}
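// E.g. a bitcast of an i16 (or of f16/bf16, first normalized to i16 by
// MaybeBitcast) to v2i8 produces element 0 from the low byte and element 1
// from the byte shifted down by 8, avoiding the default stack-based promotion
// mentioned above.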
5232
5233/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
5234static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5235 SmallVectorImpl<SDValue> &Results) {
5236 EVT ResVT = N->getValueType(0);
5237 SDLoc DL(N);
5238
5239 assert(ResVT.isVector() && "Vector load must have vector type");
5240
5241 auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
5242 if (!NumEltsAndEltVT)
5243 return;
5244 auto [NumElts, EltVT] = NumEltsAndEltVT.value();
5245
5246 LoadSDNode *LD = cast<LoadSDNode>(N);
5247
5248 Align Alignment = LD->getAlign();
5249 auto &TD = DAG.getDataLayout();
5250 Align PrefAlign =
5251 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5252 if (Alignment < PrefAlign) {
5253 // This load is not sufficiently aligned, so bail out and let this vector
5254 // load be scalarized. Note that we may still be able to emit smaller
5255 // vector loads. For example, if we are loading a <4 x float> with an
5256 // alignment of 8, this check will fail but the legalizer will try again
5257 // with 2 x <2 x float>, which will succeed with an alignment of 8.
5258 return;
5259 }
5260
5261 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5262 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
5263 // loaded type to i16 and propagate the "real" type as the memory type.
5264 bool NeedTrunc = false;
5265 if (EltVT.getSizeInBits() < 16) {
5266 EltVT = MVT::i16;
5267 NeedTrunc = true;
5268 }
5269
5270 unsigned Opcode = 0;
5271 SDVTList LdResVTs;
5272
5273 switch (NumElts) {
5274 default:
5275 return;
5276 case 2:
5277 Opcode = NVPTXISD::LoadV2;
5278 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5279 break;
5280 case 4: {
5281 Opcode = NVPTXISD::LoadV4;
5282 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5283 LdResVTs = DAG.getVTList(ListVTs);
5284 break;
5285 }
5286 }
5287
5288 // Copy regular operands
5289 SmallVector<SDValue, 8> OtherOps(N->ops());
5290
5291 // The select routine does not have access to the LoadSDNode instance, so
5292 // pass along the extension information
5293 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5294
5295 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5296 LD->getMemoryVT(),
5297 LD->getMemOperand());
5298
5299 SmallVector<SDValue> ScalarRes;
5300 assert(NumElts <= ResVT.getVectorNumElements() &&
5301 "NumElts should not increase, only decrease or stay the same.");
5302 if (NumElts < ResVT.getVectorNumElements()) {
5303 // If the number of elements has decreased, getVectorLoweringShape has
5304 // upsized the element types
5305 assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
5306 EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
5307 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
5308 // into individual elements.
5309 for (unsigned i = 0; i < NumElts; ++i) {
5310 SDValue SubVector = NewLD.getValue(i);
5311 DAG.ExtractVectorElements(SubVector, ScalarRes);
5312 }
5313 } else {
5314 for (unsigned i = 0; i < NumElts; ++i) {
5315 SDValue Res = NewLD.getValue(i);
5316 if (NeedTrunc)
5317 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5318 ScalarRes.push_back(Res);
5319 }
5320 }
5321
5322 SDValue LoadChain = NewLD.getValue(NumElts);
5323
5324 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5325
5326 Results.push_back(BuildVec);
5327 Results.push_back(LoadChain);
5328}
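// Illustrative sketch: with sufficient alignment a <4 x float> load becomes a
// single LoadV4 returning four f32 values plus a chain. When
// getVectorLoweringShape shrinks the element count (say, a wider vector of i16
// handed back as 32-bit v2i16 parts), the ExtractVectorElements loop above
// splits those parts back into the scalars the original vector type expects.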
5329
5330static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5331 SmallVectorImpl<SDValue> &Results) {
5332 SDValue Chain = N->getOperand(0);
5333 SDValue Intrin = N->getOperand(1);
5334 SDLoc DL(N);
5335
5336 // Get the intrinsic ID
5337 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5338 switch (IntrinNo) {
5339 default:
5340 return;
5341 case Intrinsic::nvvm_ldu_global_i:
5342 case Intrinsic::nvvm_ldu_global_f:
5343 case Intrinsic::nvvm_ldu_global_p: {
5344 EVT ResVT = N->getValueType(0);
5345
5346 if (ResVT.isVector()) {
5347 // Vector LDG/LDU
5348
5349 unsigned NumElts = ResVT.getVectorNumElements();
5350 EVT EltVT = ResVT.getVectorElementType();
5351
5352 // Since LDU/LDG are target nodes, we cannot rely on DAG type
5353 // legalization.
5354 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
5355 // loaded type to i16 and propagate the "real" type as the memory type.
5356 bool NeedTrunc = false;
5357 if (EltVT.getSizeInBits() < 16) {
5358 EltVT = MVT::i16;
5359 NeedTrunc = true;
5360 }
5361
5362 unsigned Opcode = 0;
5363 SDVTList LdResVTs;
5364
5365 switch (NumElts) {
5366 default:
5367 return;
5368 case 2:
5369 Opcode = NVPTXISD::LDUV2;
5370 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5371 break;
5372 case 4: {
5373 Opcode = NVPTXISD::LDUV4;
5374 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5375 LdResVTs = DAG.getVTList(ListVTs);
5376 break;
5377 }
5378 }
5379
5380 SmallVector<SDValue, 8> OtherOps;
5381
5382 // Copy regular operands
5383
5384 OtherOps.push_back(Chain); // Chain
5385 // Skip operand 1 (intrinsic ID)
5386 // Others
5387 OtherOps.append(N->op_begin() + 2, N->op_end());
5388
5389 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5390
5391 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5392 MemSD->getMemoryVT(),
5393 MemSD->getMemOperand());
5394
5395 SmallVector<SDValue, 4> ScalarRes;
5396
5397 for (unsigned i = 0; i < NumElts; ++i) {
5398 SDValue Res = NewLD.getValue(i);
5399 if (NeedTrunc)
5400 Res =
5401 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5402 ScalarRes.push_back(Res);
5403 }
5404
5405 SDValue LoadChain = NewLD.getValue(NumElts);
5406
5407 SDValue BuildVec =
5408 DAG.getBuildVector(ResVT, DL, ScalarRes);
5409
5410 Results.push_back(BuildVec);
5411 Results.push_back(LoadChain);
5412 } else {
5413 // i8 LDG/LDU
5414 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5415 "Custom handling of non-i8 ldu/ldg?");
5416
5417 // Just copy all operands as-is
5418 SmallVector<SDValue, 4> Ops(N->ops());
5419
5420 // Force output to i16
5421 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5422
5423 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5424
5425 // We make sure the memory type is i8, which will be used during isel
5426 // to select the proper instruction.
5427 SDValue NewLD =
5428 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5429 MVT::i8, MemSD->getMemOperand());
5430
5431 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5432 NewLD.getValue(0)));
5433 Results.push_back(NewLD.getValue(1));
5434 }
5435 }
5436 }
5437}
5438
5439static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
5440 SmallVectorImpl<SDValue> &Results) {
5441 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
5442 // result so that it can pass the legalization
5443 SDLoc DL(N);
5444 SDValue Chain = N->getOperand(0);
5445 SDValue Reg = N->getOperand(1);
5446 SDValue Glue = N->getOperand(2);
5447
5448 assert(Reg.getValueType() == MVT::i128 &&
5449 "Custom lowering for CopyFromReg with 128-bit reg only");
5450 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
5451 N->getValueType(2)};
5452 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
5453
5454 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
5455 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
5456 {NewValue.getValue(0), NewValue.getValue(1)});
5457
5458 Results.push_back(Pair);
5459 Results.push_back(NewValue.getValue(2));
5460 Results.push_back(NewValue.getValue(3));
5461}
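// In short, the i128 CopyFromReg is re-issued as a CopyFromReg producing two
// i64 halves (plus chain and glue), and the halves are stitched back together
// with BUILD_PAIR so existing users still see a single i128 value.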
5462
5463void NVPTXTargetLowering::ReplaceNodeResults(
5464 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5465 switch (N->getOpcode()) {
5466 default:
5467 report_fatal_error("Unhandled custom legalization");
5468 case ISD::BITCAST:
5469 ReplaceBITCAST(N, DAG, Results);
5470 return;
5471 case ISD::LOAD:
5472 ReplaceLoadVector(N, DAG, Results);
5473 return;
5474 case ISD::INTRINSIC_W_CHAIN:
5475 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5476 return;
5477 case ISD::CopyFromReg:
5478 ReplaceCopyFromReg_128(N, DAG, Results);
5479 return;
5480 }
5481}
5482
5483NVPTXTargetLowering::AtomicExpansionKind
5484NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
5485 Type *Ty = AI->getValOperand()->getType();
5486
5487 if (AI->isFloatingPointOperation()) {
5488 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
5489 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
5490 STI.getPTXVersion() >= 63)
5491 return AtomicExpansionKind::None;
5492 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
5493 STI.getPTXVersion() >= 78)
5494 return AtomicExpansionKind::None;
5495 if (Ty->isFloatTy())
5496 return AtomicExpansionKind::None;
5497 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
5498 return AtomicExpansionKind::None;
5499 }
5500 return AtomicExpansionKind::CmpXChg;
5501 }
5502
5503 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
5504 auto ITy = cast<llvm::IntegerType>(Ty);
5505
5506 switch (AI->getOperation()) {
5507 default:
5508 return AtomicExpansionKind::CmpXChg;
5509 case AtomicRMWInst::BinOp::And:
5510 case AtomicRMWInst::BinOp::Or:
5511 case AtomicRMWInst::BinOp::Xor:
5512 case AtomicRMWInst::BinOp::Xchg:
5513 switch (ITy->getBitWidth()) {
5514 case 8:
5515 case 16:
5516 return AtomicExpansionKind::CmpXChg;
5517 case 32:
5518 return AtomicExpansionKind::None;
5519 case 64:
5520 if (STI.hasAtomBitwise64())
5521 return AtomicExpansionKind::None;
5522 return AtomicExpansionKind::CmpXChg;
5523 default:
5524 llvm_unreachable("unsupported width encountered");
5525 }
5526 case AtomicRMWInst::BinOp::Add:
5527 case AtomicRMWInst::BinOp::Sub:
5528 case AtomicRMWInst::BinOp::Max:
5529 case AtomicRMWInst::BinOp::Min:
5530 case AtomicRMWInst::BinOp::UMax:
5531 case AtomicRMWInst::BinOp::UMin:
5532 switch (ITy->getBitWidth()) {
5533 case 8:
5534 case 16:
5535 return AtomicExpansionKind::CmpXChg;
5536 case 32:
5537 return AtomicExpansionKind::None;
5538 case 64:
5539 if (STI.hasAtomMinMax64())
5540 return AtomicExpansionKind::None;
5541 return AtomicExpansionKind::CmpXChg;
5542 default:
5543 llvm_unreachable("unsupported width encountered");
5544 }
5545 }
5546
5547 return AtomicExpansionKind::CmpXChg;
5548}
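// Summary of the policy above, roughly: 8- and 16-bit integer atomicrmw
// operations are expanded via cmpxchg loops, 32-bit ones stay native, and
// 64-bit ones stay native only when the subtarget reports hasAtomBitwise64 or
// hasAtomMinMax64 for the respective operation group; fadd stays native for
// the guarded f16/bf16/f32/f64 cases, and anything else falls back to a
// cmpxchg expansion.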
5549
5550// Pin NVPTXTargetObjectFile's vtables to this file.
5551NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
5552
5553MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
5554 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
5555 return getDataSection();
5556}
#define MAKE_CASE(V)
static const LLT F32
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static std::optional< std::pair< unsigned int, EVT > > getVectorLoweringShape(EVT VectorVT)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue Value)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
@ PVF_FIRST
@ PVF_SCALAR
@ PVF_INNER
@ PVF_LAST
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
unsigned SmVersion
Definition: NVVMReflect.cpp:79
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
bool isFloatingPointOperation() const
Definition: Instructions.h:882
BinOp getOperation() const
Definition: Instructions.h:805
Value * getValOperand()
Definition: Instructions.h:874
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition: Attributes.h:833
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:36
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
Machine Value Type.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
bool hasHWROT32() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getPTXVersion() const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool allowFP16Math() const
bool hasAtomCas16() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool allowUnsafeFPMath(MachineFunction &MF) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node type undefined.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:750
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:801
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:856
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:827
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:700
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
Class to represent struct types.
Definition: DerivedTypes.h:218
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
@ VoidTyID
type with no size
Definition: Type.h:63
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:303
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
Definition: StringSaver.h:52
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
int getNumOccurrences() const
Definition: CommandLine.h:399
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1197
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1226
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
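As a hedged illustration (not from this file), a custom lowering hook might use ISD::BITCAST to reinterpret a value's bits at a different type:
// Sketch only: reinterpret an MVT::f32 value as MVT::i32 without changing
// its bits. DAG, DL and Val stand in for the SelectionDAG, debug location
// and operand normally available inside a lowering hook.
static SDValue bitcastF32ToI32(SelectionDAG &DAG, const SDLoc &DL, SDValue Val) {
  return DAG.getNode(ISD::BITCAST, DL, MVT::i32, Val);
}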
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ SSUBO
Same overflow-aware nodes as SADDO, but for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:366
@ SMULO
Same overflow-aware nodes as SADDO/SSUBO, but for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:223
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1217
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
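A small sketch of how the vector opcodes above combine (assuming a SelectionDAG DAG, an SDLoc DL, and two f16 scalars Lo and Hi already in scope):
// Sketch: assemble a v2f16 from two scalars, then read lane 0 back out.
SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, Lo, Hi);
SDValue Lane0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Vec,
                            DAG.getVectorIdxConstant(0, DL));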
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool Isv2x16VT(EVT VT)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
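An illustration of the range helpers above (a sketch; VTs is a hypothetical SmallVector<EVT, 4>, and the usual STLExtras.h and Debug.h headers are assumed):
// Sketch: all_of and enumerate over a hypothetical list of value types.
bool AllScalarInts =
    all_of(VTs, [](EVT VT) { return VT.isScalarInteger(); });
for (const auto &En : enumerate(VTs))
  dbgs() << "value " << En.index() << " is " << En.value().getEVTString() << "\n";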
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:396
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
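A quick worked example of the MathExtras helpers above (results shown in the comments):
// Sketch: round a size up to the next power of two, then verify it.
uint64_t Padded = PowerOf2Ceil(12);                          // 16
bool IsPow2 = isPowerOf2_32(static_cast<uint32_t>(Padded));  // true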
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
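A sketch of the signature above in use (TLI, DL and Ty are assumed to be the current TargetLowering, DataLayout and some aggregate IR type):
// Sketch: flatten an aggregate IR type into its component EVTs plus byte
// offsets, then print each piece.
SmallVector<EVT, 8> ValueVTs;
SmallVector<TypeSize, 8> Offsets;
ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
for (unsigned I = 0, E = ValueVTs.size(); I != E; ++I)
  dbgs() << "piece " << I << ": " << ValueVTs[I].getEVTString() << " at byte "
         << Offsets[I].getFixedValue() << "\n";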
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
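A worked example of the alignment helpers above (a sketch; results shown in the comments):
// Sketch: a field at byte offset 12 under a 16-byte base alignment.
Align Base(16);
Align FieldAlign = commonAlignment(Base, 12); // Align(4): min of 16 and the
                                              // largest power of two dividing 12
uint64_t NextSlot = alignTo(13, Base);        // 16: round 13 up to a multiple of 16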
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:257
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
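A sketch of the EVT queries above in combination (Ctx is an assumed LLVMContext):
// Sketch: build v8f16, then inspect and re-type it with the queries above.
EVT VT = EVT::getVectorVT(Ctx, MVT::f16, 8);
if (VT.isVector() && VT.isFloatingPoint()) {
  EVT EltVT = VT.getVectorElementType();        // f16
  unsigned NumElts = VT.getVectorNumElements(); // 8
  EVT IntVT = VT.changeTypeToInteger();         // v8i16
  (void)EltVT; (void)NumElts; (void)IntVT;
}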
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)