1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
22#include "llvm/IR/GlobalValue.h"
24#include "llvm/IR/IntrinsicsNVPTX.h"
31#include <optional>
32
33using namespace llvm;
34
35#define DEBUG_TYPE "nvptx-isel"
36#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
37
38static cl::opt<bool>
39 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
40 cl::desc("Enable reciprocal sqrt optimization"));
41
42// FIXME: This is a WAR to recover lost performance from #155024.
43// We still need to investigate the regression and find a more permanent
44// solution.
45 static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(false),
46 cl::Hidden,
47 cl::desc("Enable MAD wide optimization"));
48
49/// createNVPTXISelDag - This pass converts a legalized DAG into an
50/// NVPTX-specific DAG, ready for instruction scheduling.
55
60
62
64
68
69bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
70 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
71 Scopes = NVPTXScopes(MF.getFunction().getContext());
72 return SelectionDAGISel::runOnMachineFunction(MF);
73}
74
76NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
78}
79
80bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
82}
83
84bool NVPTXDAGToDAGISel::useF32FTZ() const {
85 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
86}
87
88bool NVPTXDAGToDAGISel::allowFMA() const {
89 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
90 return TL->allowFMA(*MF, OptLevel);
91}
92
93bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
94
95bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
96
97/// Select - Select instructions not customized! Used for
98/// expanded, promoted and normal instructions.
99void NVPTXDAGToDAGISel::Select(SDNode *N) {
100
101 if (N->isMachineOpcode()) {
102 N->setNodeId(-1);
103 return; // Already selected.
104 }
105
106 switch (N->getOpcode()) {
107 case ISD::LOAD:
108 case ISD::ATOMIC_LOAD:
109 case NVPTXISD::MLoad:
110 if (tryLoad(N))
111 return;
112 break;
113 case ISD::STORE:
114 case ISD::ATOMIC_STORE:
115 if (tryStore(N))
116 return;
117 break;
118 case ISD::ATOMIC_FENCE:
119 if (tryFence(N))
120 return;
121 break;
122 case NVPTXISD::UNPACK_VECTOR:
123 tryUNPACK_VECTOR(N);
124 return;
125 case ISD::EXTRACT_VECTOR_ELT:
126 if (tryEXTRACT_VECTOR_ELEMENT(N))
127 return;
128 break;
129 case NVPTXISD::SETP_F16X2:
130 SelectSETP_F16X2(N);
131 return;
132 case NVPTXISD::SETP_BF16X2:
133 SelectSETP_BF16X2(N);
134 return;
135 case NVPTXISD::LoadV2:
136 case NVPTXISD::LoadV4:
137 case NVPTXISD::LoadV8:
138 if (tryLoadVector(N))
139 return;
140 break;
141 case NVPTXISD::LDUV2:
142 case NVPTXISD::LDUV4:
143 if (tryLDU(N))
144 return;
145 break;
146 case NVPTXISD::StoreV2:
147 case NVPTXISD::StoreV4:
148 case NVPTXISD::StoreV8:
149 if (tryStoreVector(N))
150 return;
151 break;
152 case ISD::INTRINSIC_W_CHAIN:
153 if (tryIntrinsicChain(N))
154 return;
155 break;
156 case ISD::INTRINSIC_VOID:
157 if (tryIntrinsicVoid(N))
158 return;
159 break;
160 case ISD::AND:
161 case ISD::SRA:
162 case ISD::SRL:
163 // Try to select BFE
164 if (tryBFE(N))
165 return;
166 break;
167 case ISD::ADDRSPACECAST:
168 SelectAddrSpaceCast(N);
169 return;
170 case ISD::CopyToReg: {
171 if (N->getOperand(1).getValueType() == MVT::i128) {
172 SelectV2I64toI128(N);
173 return;
174 }
175 break;
176 }
177 case ISD::CopyFromReg: {
178 if (N->getOperand(1).getValueType() == MVT::i128) {
179 SelectI128toV2I64(N);
180 return;
181 }
182 break;
183 }
186 selectAtomicSwap128(N);
187 return;
188 case ISD::FADD:
189 case ISD::FMUL:
190 case ISD::FSUB:
191 if (tryBF16ArithToFMA(N))
192 return;
193 break;
194 case ISD::BR_JT:
195 return selectBR_JT(N);
196 default:
197 break;
198 }
199 SelectCode(N);
200}
201
202#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
203 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
204 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
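// For illustration: TCGEN05_LD_OPCODE(16x64b, x2) expands to
// NVPTX::TCGEN05_LD_16x64b_x2 when enablePack is false, and to the
// NVPTX::TCGEN05_LD_16x64b_x2_PACK variant when enablePack is true.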
205
206static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
207 switch (IID) {
208 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
209 return TCGEN05_LD_OPCODE(16x64b, x1);
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
211 return TCGEN05_LD_OPCODE(16x64b, x2);
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
213 return TCGEN05_LD_OPCODE(16x64b, x4);
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
215 return TCGEN05_LD_OPCODE(16x64b, x8);
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
217 return TCGEN05_LD_OPCODE(16x64b, x16);
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
219 return TCGEN05_LD_OPCODE(16x64b, x32);
220 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
221 return TCGEN05_LD_OPCODE(16x64b, x64);
222 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
223 return TCGEN05_LD_OPCODE(16x64b, x128);
224 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
225 return TCGEN05_LD_OPCODE(16x128b, x1);
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
227 return TCGEN05_LD_OPCODE(16x128b, x2);
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
229 return TCGEN05_LD_OPCODE(16x128b, x4);
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
231 return TCGEN05_LD_OPCODE(16x128b, x8);
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
233 return TCGEN05_LD_OPCODE(16x128b, x16);
234 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
235 return TCGEN05_LD_OPCODE(16x128b, x32);
236 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
237 return TCGEN05_LD_OPCODE(16x128b, x64);
238 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
239 return TCGEN05_LD_OPCODE(16x256b, x1);
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
241 return TCGEN05_LD_OPCODE(16x256b, x2);
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
243 return TCGEN05_LD_OPCODE(16x256b, x4);
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
245 return TCGEN05_LD_OPCODE(16x256b, x8);
246 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
247 return TCGEN05_LD_OPCODE(16x256b, x16);
248 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
249 return TCGEN05_LD_OPCODE(16x256b, x32);
250 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
251 return TCGEN05_LD_OPCODE(16x32bx2, x1);
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
253 return TCGEN05_LD_OPCODE(16x32bx2, x2);
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
255 return TCGEN05_LD_OPCODE(16x32bx2, x4);
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
257 return TCGEN05_LD_OPCODE(16x32bx2, x8);
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
259 return TCGEN05_LD_OPCODE(16x32bx2, x16);
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
261 return TCGEN05_LD_OPCODE(16x32bx2, x32);
262 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
263 return TCGEN05_LD_OPCODE(16x32bx2, x64);
264 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
265 return TCGEN05_LD_OPCODE(16x32bx2, x128);
266 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
267 return TCGEN05_LD_OPCODE(32x32b, x1);
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
269 return TCGEN05_LD_OPCODE(32x32b, x2);
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
271 return TCGEN05_LD_OPCODE(32x32b, x4);
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
273 return TCGEN05_LD_OPCODE(32x32b, x8);
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
275 return TCGEN05_LD_OPCODE(32x32b, x16);
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
277 return TCGEN05_LD_OPCODE(32x32b, x32);
278 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
279 return TCGEN05_LD_OPCODE(32x32b, x64);
280 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
281 return TCGEN05_LD_OPCODE(32x32b, x128);
282 }
283 llvm_unreachable("unhandled tcgen05.ld lowering");
284}
285
286void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
287 if (!Subtarget->hasTcgen05InstSupport())
288 report_fatal_error(
289 "tcgen05.ld is not supported on this architecture variant");
290
291 SDLoc DL(N);
292 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
293
294 if (hasOffset) {
295 bool enablePack = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
296 auto OffsetNode = CurDAG->getTargetConstant(
297 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL, MVT::i32);
298 ReplaceNode(N, CurDAG->getMachineNode(
299 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
300 {N->getOperand(2), OffsetNode, N->getOperand(0)}));
301 } else {
302 bool enablePack = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
303 ReplaceNode(N, CurDAG->getMachineNode(
304 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
305 {N->getOperand(2), N->getOperand(0)}));
306 }
307}
308
309bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
310 unsigned IID = N->getConstantOperandVal(1);
311 switch (IID) {
312 default:
313 return false;
314 case Intrinsic::nvvm_ldu_global_f:
315 case Intrinsic::nvvm_ldu_global_i:
316 case Intrinsic::nvvm_ldu_global_p:
317 return tryLDU(N);
318
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
323 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
324 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
325 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
326 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
327 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
328 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
329 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
330 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
331 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
332 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
333 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
334 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
335 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
336 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
337 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
338 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
339 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
344 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
345 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
346 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
347 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
348 SelectTcgen05Ld(N);
349 return true;
350 }
351
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
356 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
357 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
358 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
359 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
360 SelectTcgen05Ld(N, /* hasOffset */ true);
361 return true;
362 }
363 }
364}
365
366// Map ISD::CONDCODE value to appropriate CmpMode expected by
367// NVPTXInstPrinter::printCmpMode()
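// For example, both ISD::SETOGT (ordered) and ISD::SETGT map to CmpMode::GT,
// while the unordered ISD::SETUGT maps to CmpMode::GTU.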
368 SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
369 using NVPTX::PTXCmpMode::CmpMode;
370 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
371 switch (CC) {
372 default:
373 llvm_unreachable("Unexpected condition code.");
374 case ISD::SETOEQ:
375 case ISD::SETEQ:
376 return CmpMode::EQ;
377 case ISD::SETOGT:
378 case ISD::SETGT:
379 return CmpMode::GT;
380 case ISD::SETOGE:
381 case ISD::SETGE:
382 return CmpMode::GE;
383 case ISD::SETOLT:
384 case ISD::SETLT:
385 return CmpMode::LT;
386 case ISD::SETOLE:
387 case ISD::SETLE:
388 return CmpMode::LE;
389 case ISD::SETONE:
390 case ISD::SETNE:
391 return CmpMode::NE;
392 case ISD::SETO:
393 return CmpMode::NUM;
394 case ISD::SETUO:
395 return CmpMode::NotANumber;
396 case ISD::SETUEQ:
397 return CmpMode::EQU;
398 case ISD::SETUGT:
399 return CmpMode::GTU;
400 case ISD::SETUGE:
401 return CmpMode::GEU;
402 case ISD::SETULT:
403 return CmpMode::LTU;
404 case ISD::SETULE:
405 return CmpMode::LEU;
406 case ISD::SETUNE:
407 return CmpMode::NEU;
408 }
409 }(CondCode.get());
410 return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
411}
412
413bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
414 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
415 SDLoc DL(N);
416 SDNode *SetP = CurDAG->getMachineNode(
417 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
418 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
419 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
420 ReplaceNode(N, SetP);
421 return true;
422}
423
424bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
425 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
426 SDLoc DL(N);
427 SDNode *SetP = CurDAG->getMachineNode(
428 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
429 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
430 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
431 ReplaceNode(N, SetP);
432 return true;
433}
434
435bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
436 SDValue Vector = N->getOperand(0);
437 MVT EltVT = N->getSimpleValueType(0);
438
439 MachineSDNode *N2 =
440 CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
441
442 ReplaceNode(N, N2);
443 return true;
444}
445
446// Find all instances of extract_vector_elt that use this v2f16 vector
447// and coalesce them into a scattering move instruction.
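// For illustration (assuming a v2f16 value %v), a pair of extracts such as
//   %a = extractelement <2 x half> %v, i32 0
//   %b = extractelement <2 x half> %v, i32 1
// is rewritten to use the two results of a single split machine node
// (e.g. I32toV2I16) rather than two independent element moves.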
448bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
449 SDValue Vector = N->getOperand(0);
450
451 MVT VT = Vector.getSimpleValueType();
452 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
453 return false;
454
455 unsigned Opcode;
456 if (VT.is32BitVector())
457 Opcode = NVPTX::I32toV2I16;
458 else if (VT.is64BitVector())
459 Opcode = NVPTX::I64toV2I32;
460 else
461 llvm_unreachable("Unhandled packed type");
462
463 // Find and record all uses of this vector that extract element 0 or 1.
464 SmallVector<SDNode *, 4> E0, E1;
465 for (auto *U : Vector.getNode()->users()) {
466 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
467 continue;
468 if (U->getOperand(0) != Vector)
469 continue;
470 if (const ConstantSDNode *IdxConst =
471 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
472 if (IdxConst->getZExtValue() == 0)
473 E0.push_back(U);
474 else if (IdxConst->getZExtValue() == 1)
475 E1.push_back(U);
476 else
477 llvm_unreachable("Invalid vector index.");
478 }
479 }
480
481 // There's no point scattering f16x2 if we only ever access one
482 // element of it.
483 if (E0.empty() || E1.empty())
484 return false;
485
486 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
487 // into EltTy,EltTy Split[EltTy]x2(V)
488 MVT EltVT = VT.getVectorElementType();
489 SDNode *ScatterOp =
490 CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
491 for (auto *Node : E0)
492 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
493 for (auto *Node : E1)
494 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
495
496 return true;
497}
498
499static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
500 switch (AS) {
515 default:
516 return std::nullopt;
517 }
518}
519
520NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
521 return convertAS(N->getMemOperand()->getAddrSpace())
522 .value_or(NVPTX::AddressSpace::Generic);
523}
524
525NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
526 // No "sem" orderings for SM/PTX versions which do not support memory ordering
529 auto Ordering = N->getMergedOrdering();
530 switch (Ordering) {
544 }
545 llvm_unreachable("Invalid atomic ordering");
546}
547
548NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
549 // No "scope" modifier for SM/PTX versions which do not support scoped atomics
550 // Functionally, these atomics are at device scope
551 if (!Subtarget->hasAtomScope())
553 return Scopes[N->getSyncScopeID()];
554}
555
556namespace {
557
558struct OperationOrderings {
559 NVPTX::Ordering InstructionOrdering, FenceOrdering;
560 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
561 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
562 : InstructionOrdering(IO), FenceOrdering(FO) {}
563};
564
565static OperationOrderings
566getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
567 AtomicOrdering Ordering = N->getSuccessOrdering();
568 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
569
570 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
571 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
572
573 // clang-format off
574
575 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
576 // Note: uses of Relaxed in the Atomic column of this table refer
577 // to LLVM AtomicOrdering::Monotonic.
578 //
579 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
580 // |---------|----------|--------------------|------------|------------------------------|
581 // | No | No | All | plain | .weak |
582 // | No | Yes | Generic,Shared, | .volatile | .volatile |
583 // | | | Global [0] | | |
584 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
585 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
586 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
587 // | | | Global [0] | | |
588 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
589 // | | | Global [0] | | |
590 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
591 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
592 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
593 // | | | | | or .volatile (PTX 8.1-) |
594 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
595 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
596 // | | | / Global [0] | | |
597
598 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
599 // by following the ABI proven sound in:
600 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
601 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
602 //
603 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
604 // |------------------------------------------------------|-------------------------------|
605 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
606 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
607 // |------------------------------------------------------|-------------------------------|
608 // | cuda::atomic_load | fence.sc.<scope>; |
609 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
610 // |------------------------------------------------------|-------------------------------|
611 // | cuda::atomic_store | fence.sc.<scope>; |
612 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
613 // |------------------------------------------------------|-------------------------------|
614 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
615 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
616
617 // clang-format on
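// For illustration (assuming sm_70+ and system scope), an IR instruction
//   %v = load atomic i32, ptr %p seq_cst, align 4
// is lowered to a fence.sc.sys followed by an ld.acquire.sys load, matching
// the seq_cst load row of the table above.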
618
619 // [0]: volatile and atomics are only supported on global or shared
620 // memory locations, accessed via generic/shared/global pointers.
621 // MMIO is only supported on global memory locations,
622 // accessed via generic/global pointers.
623 // TODO: Implement MMIO access via generic pointer to global.
624 // Currently implemented for global pointers only.
625
626 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
627 // PTX instructions fails to preserve their C++ side-effects.
628 //
629 // Example (https://github.com/llvm/llvm-project/issues/62057):
630 //
631 // void example() {
632 // std::atomic<bool> True = true;
633 // while (True.load(std::memory_order_relaxed));
634 // }
635 //
636 // A C++ program that calls "example" is well-defined: the infinite loop
637 // performs an atomic operation. By lowering volatile/atomics to
638 // "weak" memory operations, we are transforming the above into:
639 //
640 // void undefined_behavior() {
641 // bool True = true;
642 // while (True);
643 // }
644 //
645 // which exhibits undefined behavior in both C++ and PTX.
646 //
647 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
648 // behavior due to lack of Independent Forward Progress. Lowering these
649 // to weak memory operations in sm_60- is therefore fine.
650 //
651 // TODO: lower atomic and volatile operations to memory locations
652 // in local, const, and param to two PTX instructions in sm_70+:
653 // - the "weak" memory instruction we are currently lowering to, and
654 // - some other instruction that preserves the side-effect, e.g.,
655 // a dead dummy volatile load.
656 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
657 CodeAddrSpace == NVPTX::AddressSpace::Const ||
658 CodeAddrSpace == NVPTX::AddressSpace::Param) {
659 return NVPTX::Ordering::NotAtomic;
660 }
661
662 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
663 // supported on sm_60 and older; this includes volatile atomics.
664 if (!(Ordering == AtomicOrdering::NotAtomic ||
665 Ordering == AtomicOrdering::Unordered ||
666 Ordering == AtomicOrdering::Monotonic) &&
667 !HasMemoryOrdering) {
668 report_fatal_error(
669 formatv("PTX does not support \"atomic\" for orderings different than"
670 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
671 "is: \"{}\".",
672 toIRString(Ordering)));
673 }
674
675 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
676 // the volatile semantics and preserve the atomic ones.
677
678 // PTX volatile and PTX atomics are not available for statespace that differ
679 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
680 // atomics is undefined if the generic address does not refer to a .global or
681 // .shared memory location.
682 bool AddrGenericOrGlobalOrShared =
683 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
684 CodeAddrSpace == NVPTX::AddressSpace::Global ||
685 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
686 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
687 if (!AddrGenericOrGlobalOrShared)
688 return NVPTX::Ordering::NotAtomic;
689
690 bool UseRelaxedMMIO =
691 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
692
693 switch (Ordering) {
694 case AtomicOrdering::NotAtomic:
695 return N->isVolatile() ? NVPTX::Ordering::Volatile
696 : NVPTX::Ordering::NotAtomic;
697 case AtomicOrdering::Unordered:
698 // We lower unordered in the exact same way as 'monotonic' to respect
699 // LLVM IR atomicity requirements.
700 case AtomicOrdering::Monotonic:
701 if (N->isVolatile())
702 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
703 : NVPTX::Ordering::Volatile;
704 else
705 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
706 : NVPTX::Ordering::Volatile;
707 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
708 // Acquire.
709 case AtomicOrdering::Acquire:
710 if (!N->readMem())
711 report_fatal_error(
712 formatv("PTX only supports Acquire Ordering on reads: {}",
713 N->getOperationName()));
714 return NVPTX::Ordering::Acquire;
715 case AtomicOrdering::Release:
716 if (!N->writeMem())
717 report_fatal_error(
718 formatv("PTX only supports Release Ordering on writes: {}",
719 N->getOperationName()));
720 return NVPTX::Ordering::Release;
721 case AtomicOrdering::AcquireRelease: {
722 report_fatal_error(
723 formatv("NVPTX does not support AcquireRelease Ordering on "
724 "read-modify-write "
725 "yet and PTX does not support it on loads or stores: {}",
726 N->getOperationName()));
727 }
728 case AtomicOrdering::SequentiallyConsistent: {
729 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
730 // sequence including a "fence.sc.sco" and the memory instruction with an
731 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
732 // whether the memory operation is a read, write, or read-modify-write.
733 //
734 // This sets the ordering of the fence to SequentiallyConsistent, and
735 // sets the corresponding ordering for the instruction.
736 NVPTX::Ordering InstrOrder;
737 if (N->readMem())
738 InstrOrder = NVPTX::Ordering::Acquire;
739 else if (N->writeMem())
740 InstrOrder = NVPTX::Ordering::Release;
741 else
742 report_fatal_error(
743 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
744 "read-modify-writes yet: {}",
745 N->getOperationName()));
746 return OperationOrderings(InstrOrder,
747 NVPTX::Ordering::SequentiallyConsistent);
748 }
749 }
750 report_fatal_error(
751 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
752 toIRString(Ordering)));
753}
754
755} // namespace
756
757NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
758 NVPTX::Ordering O) const {
759 switch (O) {
760 case NVPTX::Ordering::NotAtomic:
761 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
762 // NVPTX uses Thread scope as the scope of non-atomic operations.
763 return NVPTX::Scope::Thread;
764 case NVPTX::Ordering::RelaxedMMIO:
765 // RelaxedMMIO operations are always system scope.
766 // If a RelaxedMMIO order was generated from an atomic volatile operation
767 // with a smaller thread scope, we bump it here to system scope.
768 return NVPTX::Scope::System;
769 case NVPTX::Ordering::Relaxed:
770 case NVPTX::Ordering::Acquire:
771 case NVPTX::Ordering::Release:
772 case NVPTX::Ordering::AcquireRelease:
773 case NVPTX::Ordering::SequentiallyConsistent: {
774 auto S = Scopes[N->getSyncScopeID()];
775
776 // Atomic operations must have a scope greater than thread.
777 if (S == NVPTX::Scope::Thread)
778 report_fatal_error(
779 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
780
781 // If scope is cluster, clusters must be supported.
782 if (S == NVPTX::Scope::Cluster)
783 Subtarget->failIfClustersUnsupported("cluster scope");
784
785 // If operation is volatile, then its scope is system.
786 return N->isVolatile() ? NVPTX::Scope::System : S;
787 }
788 llvm_unreachable("unhandled ordering");
789}
790
791static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
792 NVPTX::AddressSpace CodeAddrSpace) {
793 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
794 // space.
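// For illustration, an invariant load such as
//   %v = load i32, ptr addrspace(1) %p, !invariant.load !0
// can be selected as an ld.global.nc (LDG) access on targets with LDG support.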
795 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
796 N.isInvariant();
797}
798
799static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
800 NVPTXSubtarget const *T) {
801 if (S == NVPTX::Scope::Cluster)
802 T->failIfClustersUnsupported(".cluster scope fence");
803
804 // Fall back to .acq_rel if .acquire, .release is not supported.
805 if (!T->hasSplitAcquireAndReleaseFences() &&
808
809 switch (O) {
811 switch (S) {
813 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
814 : NVPTX::INT_MEMBAR_SYS;
816 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
817 : NVPTX::INT_MEMBAR_CTA;
819 return NVPTX::atomic_thread_fence_acquire_cluster;
821 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
822 : NVPTX::INT_MEMBAR_GL;
826 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
827 ScopeToString(S)));
828 }
829 break;
831 switch (S) {
833 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
834 : NVPTX::INT_MEMBAR_SYS;
836 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
837 : NVPTX::INT_MEMBAR_CTA;
839 return NVPTX::atomic_thread_fence_release_cluster;
841 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
842 : NVPTX::INT_MEMBAR_GL;
846 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
847 ScopeToString(S)));
848 }
849 break;
851 switch (S) {
853 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
854 : NVPTX::INT_MEMBAR_SYS;
856 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
857 : NVPTX::INT_MEMBAR_CTA;
859 return NVPTX::atomic_thread_fence_acq_rel_cluster;
861 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
862 : NVPTX::INT_MEMBAR_GL;
866 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
867 ScopeToString(S)));
868 }
869 break;
870 }
872 switch (S) {
874 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
875 : NVPTX::INT_MEMBAR_SYS;
877 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
878 : NVPTX::INT_MEMBAR_CTA;
880 return NVPTX::atomic_thread_fence_seq_cst_cluster;
882 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
883 : NVPTX::INT_MEMBAR_GL;
886 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
887 ScopeToString(S)));
888 }
889 break;
890 }
896 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
897 OrderingToString(O), ScopeToString(S)));
898 }
899 llvm_unreachable("unhandled ordering");
900}
901
902// Returns Memory Order and Scope of a memory instruction, and
903// inserts any fence before the instruction that's required to
904// implement its memory ordering.
905std::pair<NVPTX::Ordering, NVPTX::Scope>
906NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
907 MemSDNode *N) {
908 auto [InstructionOrdering, FenceOrdering] =
909 getOperationOrderings(N, Subtarget);
910 auto Scope = getOperationScope(N, InstructionOrdering);
911
912 // If a fence is required before the operation, insert it:
913 switch (NVPTX::Ordering(FenceOrdering)) {
914 case NVPTX::Ordering::NotAtomic:
915 break;
916 case NVPTX::Ordering::SequentiallyConsistent: {
917 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
918 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
919 break;
920 }
921 default:
922 report_fatal_error(
923 formatv("Unexpected fence ordering: \"{}\".",
924 OrderingToString(NVPTX::Ordering(FenceOrdering))));
925 }
926 return {InstructionOrdering, Scope};
927}
928
929void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
930 SDValue Src = N->getOperand(0);
931 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
932 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
933 unsigned DstAddrSpace = CastN->getDestAddressSpace();
934 SDLoc DL(N);
935 assert(SrcAddrSpace != DstAddrSpace &&
936 "addrspacecast must be between different address spaces");
937
938 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
939 // Specific to generic
940
941 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
942 SDValue CvtNone =
943 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
944 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
945 Src, CvtNone);
946 Src = SDValue(Cvt, 0);
947 }
948
949 unsigned Opc;
950 switch (SrcAddrSpace) {
951 default: report_fatal_error("Bad address space in addrspacecast");
952 case ADDRESS_SPACE_GLOBAL:
953 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
954 break;
955 case ADDRESS_SPACE_SHARED:
956 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
957 break;
958 case ADDRESS_SPACE_SHARED_CLUSTER:
959 if (!TM.is64Bit())
960 report_fatal_error(
961 "Shared cluster address space is only supported in 64-bit mode");
962 Opc = NVPTX::cvta_shared_cluster_64;
963 break;
964 case ADDRESS_SPACE_CONST:
965 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
966 break;
967 case ADDRESS_SPACE_LOCAL:
968 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
969 break;
970 case ADDRESS_SPACE_PARAM:
971 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
972 break;
973 }
974 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
975 return;
976 } else {
977 // Generic to specific
978 if (SrcAddrSpace != 0)
979 report_fatal_error("Cannot cast between two non-generic address spaces");
980 unsigned Opc;
981 switch (DstAddrSpace) {
982 default: report_fatal_error("Bad address space in addrspacecast");
983 case ADDRESS_SPACE_GLOBAL:
984 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
985 break;
986 case ADDRESS_SPACE_SHARED:
987 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
988 break;
989 case ADDRESS_SPACE_SHARED_CLUSTER:
990 if (!TM.is64Bit())
991 report_fatal_error(
992 "Shared cluster address space is only supported in 64-bit mode");
993 Opc = NVPTX::cvta_to_shared_cluster_64;
994 break;
995 case ADDRESS_SPACE_CONST:
996 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
997 break;
998 case ADDRESS_SPACE_LOCAL:
999 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
1000 break;
1001 case ADDRESS_SPACE_PARAM:
1002 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
1003 break;
1004 }
1005
1006 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
1007 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
1008 SDValue CvtNone =
1009 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
1010 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
1011 SDValue(CVTA, 0), CvtNone);
1012 }
1013
1014 ReplaceNode(N, CVTA);
1015 return;
1016 }
1017}
1018
1019// Helper function template to reduce amount of boilerplate code for
1020// opcode selection.
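// For example, pickOpcodeForVT(MVT::f32, NVPTX::LD_i16, NVPTX::LD_i32,
// NVPTX::LD_i64) yields NVPTX::LD_i32, since 32-bit scalar and packed types
// all share the i32 opcode class.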
1021static std::optional<unsigned>
1022pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1023 std::optional<unsigned> Opcode_i32,
1024 std::optional<unsigned> Opcode_i64) {
1025 switch (VT) {
1026 case MVT::f16:
1027 case MVT::i16:
1028 case MVT::bf16:
1029 return Opcode_i16;
1030 case MVT::v2f16:
1031 case MVT::v2bf16:
1032 case MVT::v2i16:
1033 case MVT::v4i8:
1034 case MVT::i32:
1035 case MVT::f32:
1036 return Opcode_i32;
1037 case MVT::v2f32:
1038 case MVT::v2i32:
1039 case MVT::i64:
1040 case MVT::f64:
1041 return Opcode_i64;
1042 default:
1043 return std::nullopt;
1044 }
1045}
1046
1047static inline bool isAddLike(const SDValue V) {
1048 return V.getOpcode() == ISD::ADD ||
1049 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1050}
1051
1052static SDValue stripAssertAlign(SDValue N) {
1053 if (N.getOpcode() == ISD::AssertAlign)
1054 N = N.getOperand(0);
1055 return N;
1056}
1057
1058// selectBaseADDR - Match a dag node which will serve as the base address for an
1059// ADDR operand pair.
1060static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1061 N = stripAssertAlign(N);
1062 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
1063 return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
1064 GA->getValueType(0), GA->getOffset(),
1065 GA->getTargetFlags());
1066 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
1067 return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
1068 ES->getTargetFlags());
1069 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
1070 return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
1071
1072 return N;
1073}
1074
1075static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1076 Addr = stripAssertAlign(Addr);
1077 APInt AccumulatedOffset(64u, 0);
1078 while (isAddLike(Addr)) {
1079 const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1080 if (!CN)
1081 break;
1082
1083 const APInt CI = CN->getAPIntValue().sext(64);
1084 if (!(CI + AccumulatedOffset).isSignedIntN(32))
1085 break;
1086
1087 AccumulatedOffset += CI;
1088 Addr = stripAssertAlign(Addr->getOperand(0));
1089 }
1090 return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
1091 MVT::i32);
1092}
1093
1094static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1095 SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG);
1096 SDValue Base = selectBaseADDR(Addr, DAG);
1097 return {Base, Offset};
1098}
1099
1100// Select a pair of operands which represent a valid PTX address, this could be
1101// one of the following things:
1102// - [var] - Offset is simply set to 0
1103// - [reg] - Offset is simply set to 0
1104// - [reg+immOff]
1105// - [var+immOff]
1106// Note that immOff must fit into a 32-bit signed integer.
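// For illustration, an address such as (add (add %base, 8), 16) is selected
// as the pair {%base, 24}, while a bare register or symbol is selected as
// {reg/var, 0}.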
1107bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1108 SDValue &Offset) {
1109 std::tie(Base, Offset) = selectADDR(Addr, CurDAG);
1110 return true;
1111}
1112
1113bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1114 MemSDNode *LD = cast<MemSDNode>(N);
1115 assert(LD->readMem() && "Expected load");
1116
1117 // do not support pre/post inc/dec
1118 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
1119 if (PlainLoad && PlainLoad->isIndexed())
1120 return false;
1121
1122 // Address Space Setting
1123 const auto CodeAddrSpace = getAddrSpace(LD);
1124 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1125 return tryLDG(LD);
1126
1127 SDLoc DL(LD);
1128 SDValue Chain = N->getOperand(0);
1129 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1130
1131 const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
1132
1133 // Vector Setting
1134 const unsigned FromType =
1135 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1136 ? NVPTX::PTXLdStInstCode::Signed
1137 : NVPTX::PTXLdStInstCode::Untyped;
1138
1139 uint32_t UsedBytesMask;
1140 switch (N->getOpcode()) {
1141 case ISD::LOAD:
1142 case ISD::ATOMIC_LOAD:
1143 UsedBytesMask = UINT32_MAX;
1144 break;
1145 case NVPTXISD::MLoad:
1146 UsedBytesMask = N->getConstantOperandVal(3);
1147 break;
1148 default:
1149 llvm_unreachable("Unexpected opcode");
1150 }
1151
1152 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1153 FromTypeWidth <= 128 && "Invalid width for load");
1154
1155 // Create the machine instruction DAG
1156 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1157 SDValue Ops[] = {getI32Imm(Ordering, DL),
1158 getI32Imm(Scope, DL),
1159 getI32Imm(CodeAddrSpace, DL),
1160 getI32Imm(FromType, DL),
1161 getI32Imm(FromTypeWidth, DL),
1162 getI32Imm(UsedBytesMask, DL),
1163 Base,
1164 Offset,
1165 Chain};
1166
1167 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1168 const std::optional<unsigned> Opcode =
1169 pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
1170 if (!Opcode)
1171 return false;
1172
1173 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1174 if (!NVPTXLD)
1175 return false;
1176
1177 MachineMemOperand *MemRef = LD->getMemOperand();
1178 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1179
1180 ReplaceNode(LD, NVPTXLD);
1181 return true;
1182}
1183
1184static unsigned getStoreVectorNumElts(SDNode *N) {
1185 switch (N->getOpcode()) {
1186 case NVPTXISD::StoreV2:
1187 return 2;
1188 case NVPTXISD::StoreV4:
1189 return 4;
1190 case NVPTXISD::StoreV8:
1191 return 8;
1192 default:
1193 llvm_unreachable("Unexpected opcode");
1194 }
1195}
1196
1197bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1198 MemSDNode *LD = cast<MemSDNode>(N);
1199
1200 // Address Space Setting
1201 const auto CodeAddrSpace = getAddrSpace(LD);
1202 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1203 return tryLDG(LD);
1204
1205 const MVT EltVT = LD->getSimpleValueType(0);
1206 SDLoc DL(LD);
1207 SDValue Chain = LD->getChain();
1208 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1209
1210 // Type Setting: fromType + fromTypeWidth
1211 //
1212 // Sign : ISD::SEXTLOAD
1213 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1214 // type is integer
1215 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1216 // Read at least 8 bits (predicates are stored as 8-bit values)
1217 // Get the original LoadSDNode::getExtensionType() value
1218 const unsigned ExtensionType = N->getConstantOperandVal(4);
1219 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1220 ? NVPTX::PTXLdStInstCode::Signed
1221 : NVPTX::PTXLdStInstCode::Untyped;
1222
1223 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1224 const uint32_t UsedBytesMask = N->getConstantOperandVal(3);
1225
1226 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1227
1228 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1229 SDValue Ops[] = {getI32Imm(Ordering, DL),
1230 getI32Imm(Scope, DL),
1231 getI32Imm(CodeAddrSpace, DL),
1232 getI32Imm(FromType, DL),
1233 getI32Imm(FromTypeWidth, DL),
1234 getI32Imm(UsedBytesMask, DL),
1235 Base,
1236 Offset,
1237 Chain};
1238
1239 std::optional<unsigned> Opcode;
1240 switch (N->getOpcode()) {
1241 default:
1242 llvm_unreachable("Unexpected opcode");
1243 case NVPTXISD::LoadV2:
1244 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2,
1245 NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
1246 break;
1247 case NVPTXISD::LoadV4:
1248 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4,
1249 NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
1250 break;
1251 case NVPTXISD::LoadV8:
1252 Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i16 */},
1253 NVPTX::LDV_i32_v8, {/* no v8i64 */});
1254 break;
1255 }
1256 if (!Opcode)
1257 return false;
1258
1259 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1260
1261 MachineMemOperand *MemRef = LD->getMemOperand();
1262 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1263
1264 ReplaceNode(LD, NVPTXLD);
1265 return true;
1266}
1267
1268bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1269 SDLoc DL(LD);
1270
1271 unsigned ExtensionType;
1272 uint32_t UsedBytesMask;
1273 if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
1274 ExtensionType = Load->getExtensionType();
1275 UsedBytesMask = UINT32_MAX;
1276 } else {
1277 ExtensionType = LD->getConstantOperandVal(4);
1278 UsedBytesMask = LD->getConstantOperandVal(3);
1279 }
1280 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1281 ? NVPTX::PTXLdStInstCode::Signed
1282 : NVPTX::PTXLdStInstCode::Untyped;
1283
1284 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1285
1286 assert(!(LD->getSimpleValueType(0).isVector() &&
1287 ExtensionType != ISD::NON_EXTLOAD));
1288
1289 const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
1290 SDValue Ops[] = {getI32Imm(FromType, DL),
1291 getI32Imm(FromTypeWidth, DL),
1292 getI32Imm(UsedBytesMask, DL),
1293 Base,
1294 Offset,
1295 LD->getChain()};
1296
1297 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1298 std::optional<unsigned> Opcode;
1299 switch (LD->getOpcode()) {
1300 default:
1301 llvm_unreachable("Unexpected opcode");
1302 case ISD::LOAD:
1303 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16,
1304 NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
1305 break;
1306 case NVPTXISD::MLoad:
1307 Opcode = pickOpcodeForVT(TargetVT, std::nullopt, NVPTX::LD_GLOBAL_NC_i32,
1308 NVPTX::LD_GLOBAL_NC_i64);
1309 break;
1310 case NVPTXISD::LoadV2:
1311 Opcode =
1312 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16,
1313 NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
1314 break;
1315 case NVPTXISD::LoadV4:
1316 Opcode =
1317 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16,
1318 NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
1319 break;
1320 case NVPTXISD::LoadV8:
1321 Opcode = pickOpcodeForVT(TargetVT, {/* no v8i16 */},
1322 NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
1323 break;
1324 }
1325 if (!Opcode)
1326 return false;
1327
1328 SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1329
1330 ReplaceNode(LD, NVPTXLDG);
1331 return true;
1332}
1333
1334static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem) {
1335 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1336 auto NumElts = Mem->getNumValues() - 1;
1337 auto ElementBitWidth = TotalWidth / NumElts;
1338 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1339 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1340 "Invalid width for load");
1341 return ElementBitWidth;
1342}
1343
1344bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1345 auto *LD = cast<MemSDNode>(N);
1346
1347 SDLoc DL(N);
1348 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1349 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1350
1351 // If this is an LDU intrinsic, the address is the third operand. If it's an
1352 // LDU SD node (from custom vector handling), then it's the second operand.
1353 SDValue Addr =
1354 LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1355
1356 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1357 SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()};
1358
1359 std::optional<unsigned> Opcode;
1360 switch (N->getOpcode()) {
1361 default:
1362 llvm_unreachable("Unexpected opcode");
1363 case ISD::INTRINSIC_W_CHAIN:
1364 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16,
1365 NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
1366 break;
1367 case NVPTXISD::LDUV2:
1368 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16,
1369 NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
1370 break;
1371 case NVPTXISD::LDUV4:
1372 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16,
1373 NVPTX::LDU_GLOBAL_v4i32, {/* no v4i64 */});
1374 break;
1375 }
1376 if (!Opcode)
1377 return false;
1378
1379 SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1380
1381 ReplaceNode(LD, NVPTXLDU);
1382 return true;
1383}
1384
1385bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1386 MemSDNode *ST = cast<MemSDNode>(N);
1387 assert(ST->writeMem() && "Expected store");
1388 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
1389 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
1390 assert((PlainStore || AtomicStore) && "Expected store");
1391
1392 // do not support pre/post inc/dec
1393 if (PlainStore && PlainStore->isIndexed())
1394 return false;
1395
1396 // Address Space Setting
1397 const auto CodeAddrSpace = getAddrSpace(ST);
1398
1399 SDLoc DL(ST);
1400 SDValue Chain = ST->getChain();
1401 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1402
1403 // Vector Setting
1404 const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
1405
1406 // Create the machine instruction DAG
1407 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1408
1409 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1410 "Invalid width for store");
1411
1412 const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
1413 SDValue Ops[] = {selectPossiblyImm(Value),
1414 getI32Imm(Ordering, DL),
1415 getI32Imm(Scope, DL),
1416 getI32Imm(CodeAddrSpace, DL),
1417 getI32Imm(ToTypeWidth, DL),
1418 Base,
1419 Offset,
1420 Chain};
1421
1422 const std::optional<unsigned> Opcode =
1423 pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16,
1424 NVPTX::ST_i32, NVPTX::ST_i64);
1425 if (!Opcode)
1426 return false;
1427
1428 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1429
1430 if (!NVPTXST)
1431 return false;
1432
1433 MachineMemOperand *MemRef = ST->getMemOperand();
1434 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1435 ReplaceNode(ST, NVPTXST);
1436 return true;
1437}
1438
1439bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1440 MemSDNode *ST = cast<MemSDNode>(N);
1441 const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
1442
1443 // Address Space Setting
1444 const auto CodeAddrSpace = getAddrSpace(ST);
1445 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1446 report_fatal_error("Cannot store to pointer that points to constant "
1447 "memory space");
1448 }
1449
1450 SDLoc DL(ST);
1451 SDValue Chain = ST->getChain();
1452 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1453
1454 const unsigned NumElts = getStoreVectorNumElts(ST);
1455
1456 SmallVector<SDValue, 16> Ops;
1457 for (auto &V : ST->ops().slice(1, NumElts))
1458 Ops.push_back(selectPossiblyImm(V));
1459 SDValue Addr = N->getOperand(NumElts + 1);
1460 const unsigned ToTypeWidth = TotalWidth / NumElts;
1461
1462 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1463 TotalWidth <= 256 && "Invalid width for store");
1464
1465 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1466 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1467 getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
1468 Offset, Chain});
1469
1470 const MVT::SimpleValueType EltVT =
1471 ST->getOperand(1).getSimpleValueType().SimpleTy;
1472 std::optional<unsigned> Opcode;
1473 switch (ST->getOpcode()) {
1474 default:
1475 return false;
1476 case NVPTXISD::StoreV2:
1477 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
1478 NVPTX::STV_i64_v2);
1479 break;
1480 case NVPTXISD::StoreV4:
1481 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
1482 NVPTX::STV_i64_v4);
1483 break;
1484 case NVPTXISD::StoreV8:
1485 Opcode = pickOpcodeForVT(EltVT, {/* no v8i16 */}, NVPTX::STV_i32_v8,
1486 {/* no v8i64 */});
1487 break;
1488 }
1489
1490 if (!Opcode)
1491 return false;
1492
1493 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1494
1495 MachineMemOperand *MemRef = ST->getMemOperand();
1496 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1497
1498 ReplaceNode(ST, NVPTXST);
1499 return true;
1500}
1501
1502/// SelectBFE - Look for instruction sequences that can be made more efficient
1503/// by using the 'bfe' (bit-field extract) PTX instruction
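// For example, (and (srl %x, 4), 255) extracts an 8-bit field starting at
// bit 4 and can be selected as a single "bfe.u32 %r, %x, 4, 8".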
1504bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1505 SDLoc DL(N);
1506 SDValue LHS = N->getOperand(0);
1507 SDValue RHS = N->getOperand(1);
1508 SDValue Len;
1509 SDValue Start;
1510 SDValue Val;
1511 bool IsSigned = false;
1512
1513 if (N->getOpcode() == ISD::AND) {
1514 // Canonicalize the operands
1515 // We want 'and %val, %mask'
1516 if (isa<ConstantSDNode>(LHS)) {
1517 std::swap(LHS, RHS);
1518 }
1519
1520 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
1521 if (!Mask) {
1522 // We need a constant mask on the RHS of the AND
1523 return false;
1524 }
1525
1526 // Extract the mask bits
1527 uint64_t MaskVal = Mask->getZExtValue();
1528 if (!isMask_64(MaskVal)) {
1529 // We *could* handle shifted masks here, but doing so would require an
1530 // 'and' operation to fix up the low-order bits so we would trade
1531 // shr+and for bfe+and, which has the same throughput
1532 return false;
1533 }
1534
1535 // How many bits are in our mask?
1536 int64_t NumBits = countr_one(MaskVal);
1537 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1538
1539 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1540 // We have a 'srl/and' pair, extract the effective start bit and length
1541 Val = LHS.getNode()->getOperand(0);
1542 Start = LHS.getNode()->getOperand(1);
1543 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
1544 if (StartConst) {
1545 uint64_t StartVal = StartConst->getZExtValue();
1546 // How many "good" bits do we have left? "good" is defined here as bits
1547 // that exist in the original value, not shifted in.
1548 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1549 if (NumBits > GoodBits) {
1550 // Do not handle the case where bits have been shifted in. In theory
1551 // we could handle this, but the cost is likely higher than just
1552 // emitting the srl/and pair.
1553 return false;
1554 }
1555 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
1556 } else {
1557 // Do not handle the case where the shift amount (can be zero if no srl
1558 // was found) is not constant. We could handle this case, but it would
1559 // require run-time logic that would be more expensive than just
1560 // emitting the srl/and pair.
1561 return false;
1562 }
1563 } else {
1564 // Do not handle the case where the LHS of the and is not a shift. While
1565 // it would be trivial to handle this case, it would just transform
1566 // 'and' -> 'bfe', but 'and' has higher throughput.
1567 return false;
1568 }
1569 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1570 if (LHS->getOpcode() == ISD::AND) {
1571 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
1572 if (!ShiftCnst) {
1573 // Shift amount must be constant
1574 return false;
1575 }
1576
1577 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1578
1579 SDValue AndLHS = LHS->getOperand(0);
1580 SDValue AndRHS = LHS->getOperand(1);
1581
1582 // Canonicalize the AND to have the mask on the RHS
1583 if (isa<ConstantSDNode>(AndLHS)) {
1584 std::swap(AndLHS, AndRHS);
1585 }
1586
1587 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
1588 if (!MaskCnst) {
1589 // Mask must be constant
1590 return false;
1591 }
1592
1593 uint64_t MaskVal = MaskCnst->getZExtValue();
1594 uint64_t NumZeros;
1595 uint64_t NumBits;
1596 if (isMask_64(MaskVal)) {
1597 NumZeros = 0;
1598 // The number of bits in the result bitfield will be the number of
1599 // trailing ones (the AND) minus the number of bits we shift off
1600 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
1601 } else if (isShiftedMask_64(MaskVal)) {
1602 NumZeros = llvm::countr_zero(MaskVal);
1603 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
1604 // The number of bits in the result bitfield will be the number of
1605 // trailing zeros plus the number of set bits in the mask minus the
1606 // number of bits we shift off
1607 NumBits = NumZeros + NumOnes - ShiftAmt;
1608 } else {
1609 // This is not a mask we can handle
1610 return false;
1611 }
1612
1613 if (ShiftAmt < NumZeros) {
1614 // Handling this case would require extra logic that would make this
1615 // transformation non-profitable
1616 return false;
1617 }
1618
1619 Val = AndLHS;
1620 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
1621 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1622
1623 // If pre-shift AND includes the sign bit in the bitfield, we must use
1624 // signed BFE to replicate that bit during bitfield extraction. If the
1625 // sign bit is not part of the mask, unsigned BFE will zero out upper bits
1626 // of the result
1627 if (N->getOpcode() == ISD::SRA)
1628 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1629 } else if (LHS->getOpcode() == ISD::SHL) {
1630 // Here, we have a pattern like:
1631 //
1632 // (sra (shl val, NN), MM)
1633 // or
1634 // (srl (shl val, NN), MM)
1635 //
1636 // If MM >= NN, we can efficiently optimize this with bfe
1637 Val = LHS->getOperand(0);
1638
1639 SDValue ShlRHS = LHS->getOperand(1);
1640 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
1641 if (!ShlCnst) {
1642 // Shift amount must be constant
1643 return false;
1644 }
1645 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1646
1647 SDValue ShrRHS = RHS;
1648 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
1649 if (!ShrCnst) {
1650 // Shift amount must be constant
1651 return false;
1652 }
1653 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1654
1655 // To avoid extra codegen and be profitable, we need Outer >= Inner
1656 if (OuterShiftAmt < InnerShiftAmt) {
1657 return false;
1658 }
1659
1660 // If the outer shift is more than the type size, we have no bitfield to
1661 // extract (since we also check that the inner shift is <= the outer shift
1662 // then this also implies that the inner shift is < the type size)
1663 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1664 return false;
1665 }
1666
1667 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
1668 MVT::i32);
1669 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
1670 DL, MVT::i32);
1671
1672 if (N->getOpcode() == ISD::SRA) {
1673 // If we have an arithmetic right shift, we need to use the signed bfe
1674 // variant
1675 IsSigned = true;
1676 }
1677 } else {
1678 // No can do...
1679 return false;
1680 }
1681 } else {
1682 // No can do...
1683 return false;
1684 }
1685
1686
1687 unsigned Opc;
1688 // For the BFE operations we form here from "and" and "srl", always use the
1689 // unsigned variants.
1690 if (Val.getValueType() == MVT::i32) {
1691 if (IsSigned) {
1692 Opc = NVPTX::BFE_S32rii;
1693 } else {
1694 Opc = NVPTX::BFE_U32rii;
1695 }
1696 } else if (Val.getValueType() == MVT::i64) {
1697 if (IsSigned) {
1698 Opc = NVPTX::BFE_S64rii;
1699 } else {
1700 Opc = NVPTX::BFE_U64rii;
1701 }
1702 } else {
1703 // We cannot handle this type
1704 return false;
1705 }
1706
1707 SDValue Ops[] = {
1708 Val, Start, Len
1709 };
1710
1711 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
1712 return true;
1713}
1714
1715// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
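// For illustration, on such targets (fadd bf16 %a, %b) is rewritten to
// fma(%a, 1.0, %b), with the 1.0 constant first materialized into a register
// (0x3F80 as a bf16 immediate) as done in GetConstant below.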
1716bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1717 EVT VT = SDValue(N, 0).getValueType();
1718 if (VT.getScalarType() != MVT::bf16)
1719 return false;
1720
1721 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1722 if (STI->hasNativeBF16Support(N->getOpcode()))
1723 return false;
1724
1725 const bool IsVec = VT.isVector();
1726 assert(!IsVec || VT.getVectorNumElements() == 2);
1727 SDLoc DL(N);
1728 SDValue N0 = N->getOperand(0);
1729 SDValue N1 = N->getOperand(1);
1730 SmallVector<SDValue, 3> Operands;
1731 auto GetConstant = [&](float Value) -> SDValue {
1732 // BF16 immediates must be legalized to integer register values
1733 APFloat APF(Value);
1734 bool LosesInfo;
1735 APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
1736 assert(!LosesInfo);
1737 if (IsVec) {
1738 auto API = APF.bitcastToAPInt();
1739 API = API.concat(API);
1740 auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
1741 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
1742 0);
1743 }
1744 auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
1745 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
1746 };
1747
1748 switch (N->getOpcode()) {
1749 case ISD::FADD:
1750 // add(a, b) -> fma(a, 1.0, b)
1751 Operands = {N0, GetConstant(1.0), N1};
1752 break;
1753 case ISD::FSUB:
1754 // sub(a, b) -> fma(b, -1.0, a)
1755 Operands = {N1, GetConstant(-1.0), N0};
1756 break;
1757 case ISD::FMUL:
1758 // mul(a, b) -> fma(a, b, -0.0)
1759 // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats
1760 Operands = {N0, N1, GetConstant(-0.0)};
1761 break;
1762 default:
1763 llvm_unreachable("Unexpected opcode");
1764 };
1765
1766 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1767 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
1768 ReplaceNode(N, FMA);
1769 return true;
1770}
1771
1772SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1773 if (V.getOpcode() == ISD::BITCAST)
1774 V = V.getOperand(0);
1775
1776 if (auto *CN = dyn_cast<ConstantSDNode>(V))
1777 return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
1778 V.getValueType());
1779 if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
1780 return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
1781 V.getValueType());
1782 return V;
1783}
1784
1785/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1786/// inline asm expressions.
1787bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
1788 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1789 std::vector<SDValue> &OutOps) {
1790 switch (ConstraintID) {
1791 default:
1792 return true;
1793 case InlineAsm::ConstraintCode::m: { // memory
1794 const auto [Base, Offset] = selectADDR(Op, CurDAG);
1795 OutOps.push_back(Base);
1796 OutOps.push_back(Offset);
1797 return false;
1798 }
1799 }
1800 return true;
1801}
1802
1803void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1804 // Lower a CopyToReg with two 64-bit inputs
1805 // Dst:i128, lo:i64, hi:i64
1806 //
1807 // CopyToReg Dst, lo, hi;
1808 //
1809 // ==>
1810 //
1811 // tmp = V2I64toI128 {lo, hi};
1812 // CopyToReg Dst, tmp;
1813 SDValue Dst = N->getOperand(1);
1814 SDValue Lo = N->getOperand(2);
1815 SDValue Hi = N->getOperand(3);
1816
1817 SDLoc DL(N);
1818 SDNode *Mov =
1819 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
1820
1821 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1822 NewOps[0] = N->getOperand(0);
1823 NewOps[1] = Dst;
1824 NewOps[2] = SDValue(Mov, 0);
1825 if (N->getNumOperands() == 5)
1826 NewOps[3] = N->getOperand(4);
1827 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL,
1828                                    SmallVector<EVT>(N->values()), NewOps);
1829 ReplaceNode(N, NewValue.getNode());
1830}
1831
1832void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1833 // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
1834 // Dst:i128, Src:i128
1835 //
1836 // {lo, hi} = CopyFromReg Src
1837 //
1838 // ==>
1839 //
1840 // {lo, hi} = I128toV2I64 Src
1841 //
1842 SDValue Ch = N->getOperand(0);
1843 SDValue Src = N->getOperand(1);
1844 SDValue Glue = N->getOperand(2);
1845 SDLoc DL(N);
1846
1847 // Add Glue and Ch to the operands and results to avoid breaking the
1848 // execution order
1849 SDNode *Mov = CurDAG->getMachineNode(
1850 NVPTX::I128toV2I64, DL,
1851 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1852 {Src, Ch, Glue});
1853
1854 ReplaceNode(N, Mov);
1855}
1856
1857bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1858 SDLoc DL(N);
1859 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
1860 unsigned int FenceOp =
1861 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
1862 Scopes[N->getConstantOperandVal(2)], Subtarget);
1863 SDValue Chain = N->getOperand(0);
1864 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
1865 ReplaceNode(N, FenceNode);
1866 return true;
1867}
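// Example (illustrative): an IR "fence seq_cst" with the default (system)
// syncscope reaches this point as Ordering::SequentiallyConsistent plus
// Scope::System; getFenceOp then picks the subtarget-appropriate opcode,
// e.g. a "fence.sc.sys"-style instruction on targets with memory ordering
// support, or a plain membar on older ones.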
1868
1869NVPTXScopes::NVPTXScopes(LLVMContext &C) {
1870 Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
1871 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
1872 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
1873 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
1874 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
1875}
1876
1877NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
1878 if (Scopes.empty())
1879 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1880 "NVPTXScopes::operator[]");
1881
1882 auto S = Scopes.find(ID);
1883 if (S == Scopes.end()) {
1884 auto scopeName = Context->getSyncScopeName(ID);
1885 assert(scopeName.has_value() && "Scope name must exist.");
1886
1887 // Build list of supported syncscopes programmatically
1888 SmallVector<StringRef> supportedScopes;
1889 for (const auto &Entry : Scopes) {
1890 if (auto name = Context->getSyncScopeName(Entry.first))
1891 supportedScopes.push_back(name->empty() ? "<empty string>" : *name);
1892 }
1893
1894 reportFatalUsageError(
1895 formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
1896 "Supported syncscopes are: {2}.",
1897 scopeName.value(), int(ID),
1898 make_range(supportedScopes.begin(), supportedScopes.end())));
1899 }
1900 return S->second;
1901}
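// Example (illustrative): an atomic operation tagged syncscope("cluster")
// maps through the IDs registered in the constructor above to
// NVPTX::Scope::Cluster, while an ID that was never registered triggers the
// fatal usage error along with the list of supported scope names.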
1902
1903bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
1904
1905#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
1906 (is_s32 \
1907 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
1908 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
1909
1910#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
1911 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
1912 : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1913
1914static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
1915 bool IsShared32,
1916 bool IsCacheHint,
1917 bool IsIm2Col) {
1918 if (IsIm2Col) {
1919 switch (Dim) {
1920 case 3:
1921 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
1922 IsShared32);
1923 case 4:
1924 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
1925 IsShared32);
1926 case 5:
1927 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
1928 IsShared32);
1929 default:
1930 llvm_unreachable("Invalid Dimension in im2col mode for "
1931 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1932 }
1933 } else {
1934 switch (Dim) {
1935 case 1:
1936 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
1937 IsShared32);
1938 case 2:
1939 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
1940 IsShared32);
1941 case 3:
1942 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
1943 IsShared32);
1944 case 4:
1945 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
1946 IsShared32);
1947 case 5:
1948 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
1949 IsShared32);
1950 default:
1951 llvm_unreachable("Invalid Dimension in tile mode for "
1952 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1953 }
1954 }
1955}
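// Example (illustrative): Dim = 3 in tile mode with a cache hint and a 64-bit
// shared address space expands through the macros above to
// NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_TILE_CH; with 32-bit shared pointers the
// SHARED32 flavor is selected instead.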
1956
1957void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
1958 unsigned RedOp,
1959 bool IsIm2Col) {
1960 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
1961 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
1962 // NumOperands = {Chain, IID} + {Actual intrinsic args}
1963 // = {2} + {4 + dims}
1964 size_t NumOps = N->getNumOperands();
1965 size_t NumDims = NumOps - 6;
1966 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
1967 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
1968
1969 SDLoc DL(N);
1970 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
1971 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
1972 Ops.push_back(N->getOperand(0)); // Chain operand
1973
1974 bool IsShared32 =
1975 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
1976 unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
1977 NumDims, IsShared32, IsCacheHint, IsIm2Col);
1978 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
1979}
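// Worked example (illustrative): a 3-D tile reduction with a cache hint has
// NumOps = 2 + 3 + 4 = 9, so NumDims = 9 - 6 = 3 and NumArgs = 3 + 3 = 6;
// the slice above therefore copies {src, dst, d0, d1, d2, cache_hint}, after
// which the reduction-op immediate and the chain are appended.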
1980
1981#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
1982 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
1983 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
1984
1985static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
1986 switch (IID) {
1987 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
1988 return TCGEN05_ST_OPCODE(16x64b, x1);
1989 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
1990 return TCGEN05_ST_OPCODE(16x64b, x2);
1991 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
1992 return TCGEN05_ST_OPCODE(16x64b, x4);
1993 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
1994 return TCGEN05_ST_OPCODE(16x64b, x8);
1995 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
1996 return TCGEN05_ST_OPCODE(16x64b, x16);
1997 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
1998 return TCGEN05_ST_OPCODE(16x64b, x32);
1999 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2000 return TCGEN05_ST_OPCODE(16x64b, x64);
2001 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2002 return TCGEN05_ST_OPCODE(16x64b, x128);
2003 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2004 return TCGEN05_ST_OPCODE(16x128b, x1);
2005 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2006 return TCGEN05_ST_OPCODE(16x128b, x2);
2007 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2008 return TCGEN05_ST_OPCODE(16x128b, x4);
2009 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2010 return TCGEN05_ST_OPCODE(16x128b, x8);
2011 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2012 return TCGEN05_ST_OPCODE(16x128b, x16);
2013 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2014 return TCGEN05_ST_OPCODE(16x128b, x32);
2015 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2016 return TCGEN05_ST_OPCODE(16x128b, x64);
2017 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2018 return TCGEN05_ST_OPCODE(16x256b, x1);
2019 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2020 return TCGEN05_ST_OPCODE(16x256b, x2);
2021 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2022 return TCGEN05_ST_OPCODE(16x256b, x4);
2023 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2024 return TCGEN05_ST_OPCODE(16x256b, x8);
2025 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2026 return TCGEN05_ST_OPCODE(16x256b, x16);
2027 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2028 return TCGEN05_ST_OPCODE(16x256b, x32);
2029 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2030 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2031 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2032 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2033 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2034 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2035 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2036 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2037 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2038 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2039 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2040 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2041 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2042 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2043 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2044 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2045 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2046 return TCGEN05_ST_OPCODE(32x32b, x1);
2047 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2048 return TCGEN05_ST_OPCODE(32x32b, x2);
2049 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2050 return TCGEN05_ST_OPCODE(32x32b, x4);
2051 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2052 return TCGEN05_ST_OPCODE(32x32b, x8);
2053 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2054 return TCGEN05_ST_OPCODE(32x32b, x16);
2055 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2056 return TCGEN05_ST_OPCODE(32x32b, x32);
2057 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2058 return TCGEN05_ST_OPCODE(32x32b, x64);
2059 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2060 return TCGEN05_ST_OPCODE(32x32b, x128);
2061 }
2062 llvm_unreachable("unhandled tcgen05.st lowering");
2063}
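// Example (illustrative): IID nvvm_tcgen05_st_16x64b_x2 expands through the
// macro above to NVPTX::TCGEN05_ST_16x64b_x2_UNPACK when enableUnpack is set,
// and to NVPTX::TCGEN05_ST_16x64b_x2 otherwise.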
2064
2065void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2066 if (!Subtarget->hasTcgen05InstSupport())
2067 reportFatalUsageError(
2068 "tcgen05.st is not supported on this architecture variant");
2069
2070 SDLoc DL(N);
2071 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2072
2073 SmallVector<SDValue, 128> Operands = {
2074 N->getOperand(2) // taddr
2075 };
2076
2077 if (hasOffset)
2078 Operands.push_back(CurDAG->getTargetConstant(
2079 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL,
2080 MVT::i32)); // Offset
2081
2082 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2083 Operands.push_back(N->getOperand(I));
2084
2085 bool enableUnpack =
2086 cast<ConstantSDNode>(N->getOperand(N->getNumOperands() - 1))
2087 ->getZExtValue();
2088
2089 Operands.push_back(N->getOperand(0)); // Chain
2090 ReplaceNode(N, CurDAG->getMachineNode(getTcgen05StOpcode(IID, enableUnpack),
2091 DL, N->getVTList(), Operands));
2092}
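// Example (illustrative): for a 16x32bx2 store (hasOffset == true) the machine
// node receives {taddr, offset immediate, value registers..., chain}; the
// trailing flag operand of the intrinsic is consumed here only to choose
// between the packed and .unpack opcode variants.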
2093
2094bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2095 unsigned IID = N->getConstantOperandVal(1);
2096 using TMARedTy = llvm::nvvm::TMAReductionOp;
2097 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2098 switch (IID) {
2099 default:
2100 return false;
2101 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2102 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2103 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2104 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2105 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2106 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
2107 return true;
2108 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2109 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2110 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2111 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
2112 /*IsIm2Col=*/true);
2113 return true;
2114 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2115 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2116 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2117 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2118 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2119 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
2120 return true;
2121 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2122 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2123 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2124 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
2125 /*IsIm2Col=*/true);
2126 return true;
2127 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2128 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2129 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2130 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2131 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2132 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
2133 return true;
2134 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2135 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2136 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2137 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
2138 /*IsIm2Col=*/true);
2139 return true;
2140 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2141 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2142 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2143 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2144 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2145 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
2146 return true;
2147 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2148 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2149 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2150 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
2151 /*IsIm2Col=*/true);
2152 return true;
2153 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2154 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2155 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2156 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2157 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2158 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
2159 return true;
2160 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2161 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2162 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2163 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
2164 /*IsIm2Col=*/true);
2165 return true;
2166 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2167 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2168 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2169 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2170 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2171 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
2172 return true;
2173 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2174 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2175 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2176 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
2177 /*IsIm2Col=*/true);
2178 return true;
2179 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2180 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2181 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2182 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2183 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2184 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
2185 return true;
2186 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2187 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2188 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2189 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
2190 /*IsIm2Col=*/true);
2191 return true;
2192 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2193 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2194 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2195 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2196 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2197 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
2198 return true;
2199 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2200 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2201 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2202 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
2203 /*IsIm2Col=*/true);
2204 return true;
2205
2206 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2207 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2208 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2209 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2210 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2211 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2212 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2213 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2214 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2215 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2216 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2217 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2218 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2219 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2220 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2221 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2222 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2223 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2224 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2225 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2226 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2227 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2228 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2229 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2230 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2231 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2232 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2233 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2234 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2235 SelectTcgen05St(N);
2236 return true;
2237 }
2238
2239 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2240 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2241 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2242 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2243 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2244 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2245 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2246 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2247 SelectTcgen05St(N, /* hasOffset */ true);
2248 return true;
2249 }
2250 }
2251}
2252
2253void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
2254 MemSDNode *AN = cast<MemSDNode>(N);
2255 SDLoc dl(N);
2256
2257 const SDValue Chain = N->getOperand(0);
2258 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
2259 SmallVector<SDValue> Ops{Base, Offset};
2260 Ops.append(N->op_begin() + 2, N->op_end());
2261 Ops.append({
2262 getI32Imm(getMemOrder(AN), dl),
2263 getI32Imm(getAtomicScope(AN), dl),
2264 getI32Imm(getAddrSpace(AN), dl),
2265 Chain,
2266 });
2267
2268 assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
2269 N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
2270 unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
2271 ? NVPTX::ATOM_EXCH_B128
2272 : NVPTX::ATOM_CAS_B128;
2273
2274 auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
2275 CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand());
2276
2277 ReplaceNode(N, ATOM);
2278}
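// Example (illustrative): a 128-bit atomic exchange lowered to
// NVPTXISD::ATOMIC_SWAP_B128 is selected to ATOM_EXCH_B128 with the address
// split into {Base, Offset}, the data operands, and immediates describing the
// memory order, scope and address space, from which the printer emits the
// corresponding "atom.exch.b128"-style PTX instruction.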
2279
2280void NVPTXDAGToDAGISel::selectBR_JT(SDNode *N) {
2281 assert(Subtarget->hasBrx() &&
2282 "BR_JT should be expanded during legalization on unsupported targets");
2283
2284 SDLoc DL(N);
2285 const SDValue InChain = N->getOperand(0);
2286 const auto *JT = cast<JumpTableSDNode>(N->getOperand(1));
2287 const SDValue Index = N->getOperand(2);
2288
2289 unsigned JId = JT->getIndex();
2290 MachineJumpTableInfo *MJTI = CurDAG->getMachineFunction().getJumpTableInfo();
2291 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2292
2293 SDValue IdV = getI32Imm(JId, DL);
2294
2295 // Generate BrxStart node
2296 MachineSDNode *Chain = CurDAG->getMachineNode(
2297 NVPTX::BRX_START, DL, {MVT::Other, MVT::Glue}, {IdV, InChain});
2298
2299 // Generate BrxItem nodes
2300 assert(!MBBs.empty());
2301 for (MachineBasicBlock *MBB : MBBs.drop_back())
2302 Chain = CurDAG->getMachineNode(
2303 NVPTX::BRX_ITEM, DL, {MVT::Other, MVT::Glue},
2304 {CurDAG->getBasicBlock(MBB), SDValue(Chain, 0), SDValue(Chain, 1)});
2305
2306 // Generate BrxEnd nodes
2307 MachineSDNode *BrxEnd =
2308 CurDAG->getMachineNode(NVPTX::BRX_END, DL, MVT::Other,
2309 {CurDAG->getBasicBlock(MBBs.back()), Index, IdV,
2310 SDValue(Chain, 0), SDValue(Chain, 1)});
2311
2312 ReplaceNode(N, BrxEnd);
2313}
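// Example (illustrative): for a jump table over blocks {BB0, BB1, BB2}, the
// chain built above is BRX_START(id) -> BRX_ITEM(BB0) -> BRX_ITEM(BB1) ->
// BRX_END(BB2, index, id), which together print as the label list and the
// "brx.idx" indirect branch in the final PTX.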