LLVM 19.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
33
34#ifdef EXPENSIVE_CHECKS
36#include "llvm/IR/Dominators.h"
37#endif
38
39#define DEBUG_TYPE "amdgpu-isel"
40
41using namespace llvm;
42
43//===----------------------------------------------------------------------===//
44// Instruction Selector Implementation
45//===----------------------------------------------------------------------===//
46
47namespace {
48static SDValue stripBitcast(SDValue Val) {
49 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
50}
51
52// Figure out if this is really an extract of the high 16-bits of a dword.
53static bool isExtractHiElt(SDValue In, SDValue &Out) {
54 In = stripBitcast(In);
55
56 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
58 if (!Idx->isOne())
59 return false;
60 Out = In.getOperand(0);
61 return true;
62 }
63 }
64
65 if (In.getOpcode() != ISD::TRUNCATE)
66 return false;
67
68 SDValue Srl = In.getOperand(0);
69 if (Srl.getOpcode() == ISD::SRL) {
70 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
71 if (ShiftAmt->getZExtValue() == 16) {
72 Out = stripBitcast(Srl.getOperand(0));
73 return true;
74 }
75 }
76 }
77
78 return false;
79}
80
81// Look through operations that obscure just looking at the low 16-bits of the
82// same register.
83static SDValue stripExtractLoElt(SDValue In) {
84 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85 SDValue Idx = In.getOperand(1);
86 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
87 return In.getOperand(0);
88 }
89
90 if (In.getOpcode() == ISD::TRUNCATE) {
91 SDValue Src = In.getOperand(0);
92 if (Src.getValueType().getSizeInBits() == 32)
93 return stripBitcast(Src);
94 }
95
96 return In;
97}
98
99} // end anonymous namespace
100
// NOTE(review): this region is pass registration plus the legacy-pass factory,
// constructor, and runOnMachineFunction. The capture dropped several source
// lines (the INITIALIZE_PASS_BEGIN/_DEPENDENCY/_END macro names, the
// createAMDGPUISelDag and constructor signature lines, and parts of
// runOnMachineFunction). Restore from upstream before compiling; only
// comments are added here.
102 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
103 false)
107#ifdef EXPENSIVE_CHECKS
110#endif
112 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
113 false)
114
115/// This pass converts a legalized DAG into a AMDGPU-specific
116// DAG, ready for instruction scheduling.
// NOTE(review): factory signature line (source line 117) missing; the body
// constructs the legacy ISel pass from \p TM and \p OptLevel.
118 CodeGenOptLevel OptLevel) {
119 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
120}
121
// Constructor: forwards to SelectionDAGISel and caches the target machine's
// late-structurize-CFG setting. (Signature line 122 missing in capture.)
123 CodeGenOptLevel OptLevel)
124 : SelectionDAGISel(TM, OptLevel) {
125 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
126}
127
// runOnMachineFunction body: caches the GCN subtarget and the FP mode
// defaults for the current function. (Lines 128/130/132 missing in capture.)
129 Subtarget = &MF.getSubtarget<GCNSubtarget>();
131 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
133}
134
// Returns true when the f16 result of \p Opc is known to leave the high 16
// bits of its 32-bit register zeroed. Per the return below, this is only
// claimed for generations up to GFX9; on gfx10 all 16-bit instructions
// preserve the high bits instead.
// NOTE(review): several case labels (source lines 143, 171-172, 176, 179)
// and the FP_ROUND/FMA/FMAD return statements (lines 186, 189-190) were
// dropped by the extraction that produced this listing.
135bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
136 // XXX - only need to list legal operations.
137 switch (Opc) {
138 case ISD::FADD:
139 case ISD::FSUB:
140 case ISD::FMUL:
141 case ISD::FDIV:
142 case ISD::FREM:
144 case ISD::UINT_TO_FP:
145 case ISD::SINT_TO_FP:
146 case ISD::FABS:
147 // Fabs is lowered to a bit operation, but it's an and which will clear the
148 // high bits anyway.
149 case ISD::FSQRT:
150 case ISD::FSIN:
151 case ISD::FCOS:
152 case ISD::FPOWI:
153 case ISD::FPOW:
154 case ISD::FLOG:
155 case ISD::FLOG2:
156 case ISD::FLOG10:
157 case ISD::FEXP:
158 case ISD::FEXP2:
159 case ISD::FCEIL:
160 case ISD::FTRUNC:
161 case ISD::FRINT:
162 case ISD::FNEARBYINT:
163 case ISD::FROUNDEVEN:
164 case ISD::FROUND:
165 case ISD::FFLOOR:
166 case ISD::FMINNUM:
167 case ISD::FMAXNUM:
168 case ISD::FLDEXP:
169 case AMDGPUISD::FRACT:
170 case AMDGPUISD::CLAMP:
173 case AMDGPUISD::FMIN3:
174 case AMDGPUISD::FMAX3:
175 case AMDGPUISD::FMED3:
177 case AMDGPUISD::RCP:
178 case AMDGPUISD::RSQ:
180 // On gfx10, all 16-bit instructions preserve the high bits.
181 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
182 case ISD::FP_ROUND:
183 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
184 // high bits on gfx9.
185 // TODO: If we had the source node we could see if the source was fma/mad
// NOTE(review): the FP_ROUND return statement (source line 186) is missing.
187 case ISD::FMA:
188 case ISD::FMAD:
// NOTE(review): lines 189-190 (additional case label and return) missing.
191 default:
192 // fcopysign, select and others may be lowered to 32-bit bit operations
193 // which don't zero the high bits.
194 return false;
195 }
196}
197
// NOTE(review): two legacy-pass member functions live here; their signature
// lines (198 and 209-214, 216) are missing from this capture. The visible
// body verifies, under EXPENSIVE_CHECKS, that every loop is in LCSSA form
// before instruction selection runs.
199#ifdef EXPENSIVE_CHECKS
200 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
201 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
202 for (auto &L : LI->getLoopsInPreorder()) {
203 assert(L->isLCSSAForm(DT));
204 }
205#endif
207}
208
// NOTE(review): analysis-usage declaration body; interior lines missing.
212#ifdef EXPENSIVE_CHECKS
215#endif
217}
218
// Try to fold a build_vector whose lo or hi half comes directly from a load
// into a d16 hi/lo load that writes only that half of the register, keeping
// the other half tied. Returns true if the node was replaced.
// NOTE(review): the function signature line (source line 219) and several
// interior lines are missing from this capture — see notes below.
220 assert(Subtarget->d16PreservesUnusedBits());
221 MVT VT = N->getValueType(0).getSimpleVT();
222 if (VT != MVT::v2i16 && VT != MVT::v2f16)
223 return false;
224
225 SDValue Lo = N->getOperand(0);
226 SDValue Hi = N->getOperand(1);
227
228 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
229
230 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
231 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
232 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
233
234 // Need to check for possible indirect dependencies on the other half of the
235 // vector to avoid introducing a cycle.
236 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
237 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
238
// NOTE(review): source line 239 missing — presumably the TiedIn initializer
// for the lo half; confirm against upstream.
240 SDValue Ops[] = {
241 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
242 };
243
244 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
245 if (LdHi->getMemoryVT() == MVT::i8) {
246 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
// NOTE(review): the ternary arms (source line 247) are missing here.
248 } else {
249 assert(LdHi->getMemoryVT() == MVT::i16);
250 }
251
252 SDValue NewLoadHi =
253 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
254 Ops, LdHi->getMemoryVT(),
255 LdHi->getMemOperand());
256
257 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
258 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
259 return true;
260 }
261
262 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
263 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
264 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
265 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
266 if (LdLo && Lo.hasOneUse()) {
267 SDValue TiedIn = getHi16Elt(Hi);
268 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
269 return false;
270
271 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
272 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
273 if (LdLo->getMemoryVT() == MVT::i8) {
274 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
// NOTE(review): the ternary arms (source line 275) are missing here.
276 } else {
277 assert(LdLo->getMemoryVT() == MVT::i16);
278 }
279
280 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
281
282 SDValue Ops[] = {
283 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
284 };
285
286 SDValue NewLoadLo =
287 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
288 Ops, LdLo->getMemoryVT(),
289 LdLo->getMemOperand());
290
291 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
292 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
293 return true;
294 }
295
296 return false;
297}
298
// Pre-ISel DAG pass: walks all nodes bottom-up and folds eligible
// build_vectors into d16 loads when the subtarget preserves unused bits.
// NOTE(review): the signature (source line 299), the reverse-iterator
// initialization (line 303), the `case` label for the switch (line 312), and
// the post-change cleanup (line 322) are missing from this capture.
300 if (!Subtarget->d16PreservesUnusedBits())
301 return;
302
304
305 bool MadeChange = false;
306 while (Position != CurDAG->allnodes_begin()) {
307 SDNode *N = &*--Position;
308 if (N->use_empty())
309 continue;
310
311 switch (N->getOpcode()) {
313 // TODO: Match load d16 from shl (extload:i16), 16
314 MadeChange |= matchLoadD16FromBuildVector(N);
315 break;
316 default:
317 break;
318 }
319 }
320
321 if (MadeChange) {
323 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
324 CurDAG->dump(););
325 }
326}
327
328bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
329 if (N->isUndef())
330 return true;
331
332 const SIInstrInfo *TII = Subtarget->getInstrInfo();
333 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
334 return TII->isInlineConstant(C->getAPIntValue());
335
336 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
337 return TII->isInlineConstant(C->getValueAPF());
338
339 return false;
340}
341
342/// Determine the register class for \p OpNo
343/// \returns The register class of the virtual register that will be used for
344/// the given operand number \p OpNo or NULL if the register class cannot be
345/// determined.
// NOTE(review): the MachineRegisterInfo declaration (source line 352) used by
// the virtual-register branch is missing from this capture.
346const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
347 unsigned OpNo) const {
348 if (!N->isMachineOpcode()) {
349 if (N->getOpcode() == ISD::CopyToReg) {
350 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
351 if (Reg.isVirtual()) {
353 return MRI.getRegClass(Reg);
354 }
355
// Physical register: look up its base class from the register info.
356 const SIRegisterInfo *TRI
357 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
358 return TRI->getPhysRegBaseClass(Reg);
359 }
360
361 return nullptr;
362 }
363
364 switch (N->getMachineOpcode()) {
365 default: {
// Generic machine node: consult the MCInstrDesc operand table.
366 const MCInstrDesc &Desc =
367 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
368 unsigned OpIdx = Desc.getNumDefs() + OpNo;
369 if (OpIdx >= Desc.getNumOperands())
370 return nullptr;
371 int RegClass = Desc.operands()[OpIdx].RegClass;
372 if (RegClass == -1)
373 return nullptr;
374
375 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
376 }
377 case AMDGPU::REG_SEQUENCE: {
// REG_SEQUENCE: operand 0 is the super-class id; each value operand is
// followed by its subregister index.
378 unsigned RCID = N->getConstantOperandVal(0);
379 const TargetRegisterClass *SuperRC =
380 Subtarget->getRegisterInfo()->getRegClass(RCID);
381
382 SDValue SubRegOp = N->getOperand(OpNo + 1);
383 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
384 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
385 SubRegIdx);
386 }
387 }
388}
389
390SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
391 SDValue Glue) const {
392 SmallVector <SDValue, 8> Ops;
393 Ops.push_back(NewChain); // Replace the chain.
394 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
395 Ops.push_back(N->getOperand(i));
396
397 Ops.push_back(Glue);
398 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
399}
400
// Insert a copy of \p Val into M0 on N's chain and glue it to N so the copy
// is scheduled immediately before the node.
// NOTE(review): the declaration of `Lowering` (source line 402) is missing
// from this capture.
401SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
403 *static_cast<const SITargetLowering*>(getTargetLowering());
404
405 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
406
407 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
408 return glueCopyToOp(N, M0, M0.getValue(1));
409}
410
// For LDS/GDS memory nodes, glue in the M0 initialization the address space
// requires: -1 for local memory when the subtarget needs M0 init, or the GDS
// size for region memory. Returns \p N unchanged otherwise.
// NOTE(review): the MachineFunction declaration (source line 417) is missing
// from this capture.
411SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
412 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
413 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
414 if (Subtarget->ldsRequiresM0Init())
415 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
416 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
418 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
419 return
420 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
421 }
422 return N;
423}
424
// Materialize a 64-bit immediate as two S_MOV_B32s (low and high dwords)
// combined into an SReg_64 with REG_SEQUENCE.
// NOTE(review): the declaration of `Lo` / first getMachineNode call (source
// line 427) is missing from this capture.
425MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
426 EVT VT) const {
428 AMDGPU::S_MOV_B32, DL, MVT::i32,
429 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
430 SDNode *Hi =
431 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
432 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
433 const SDValue Ops[] = {
434 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
435 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
436 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
437
438 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
439}
440
// Select a build_vector / scalar_to_vector into a REG_SEQUENCE over
// \p RegClassID, padding missing elements with IMPLICIT_DEF.
// NOTE(review): the continuations of two ternaries (source lines 462, 473,
// 484 — the amdgcn triple constant and the R600 subreg alternatives) are
// missing from this capture.
441void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
442 EVT VT = N->getValueType(0);
443 unsigned NumVectorElts = VT.getVectorNumElements();
444 EVT EltVT = VT.getVectorElementType();
445 SDLoc DL(N);
446 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
447
448 if (NumVectorElts == 1) {
// Single element: just constrain the operand to the register class.
449 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
450 RegClass);
451 return;
452 }
453
454 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
455 "supported yet");
456 // 32 = Max Num Vector Elements
457 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
458 // 1 = Vector Register Class
459 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
460
461 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
463 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
464 bool IsRegSeq = true;
465 unsigned NOps = N->getNumOperands();
466 for (unsigned i = 0; i < NOps; i++) {
467 // XXX: Why is this here?
468 if (isa<RegisterSDNode>(N->getOperand(i))) {
469 IsRegSeq = false;
470 break;
471 }
472 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
474 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
475 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
476 }
477 if (NOps != NumVectorElts) {
478 // Fill in the missing undef elements if this was a scalar_to_vector.
479 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
480 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
481 DL, EltVT);
482 for (unsigned i = NOps; i < NumVectorElts; ++i) {
483 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
485 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
486 RegSeqArgs[1 + (2 * i) + 1] =
487 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
488 }
489 }
490
491 if (!IsRegSeq)
492 SelectCode(N);
493 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
494}
495
// Main instruction-selection entry point: handles nodes that need custom
// selection (64-bit add/sub, carry ops, build_vector/pair, 64-bit constants,
// BFE, mul-hi, bitfield extracts, intrinsics, ...) and defers everything
// else to the tablegen-generated SelectCode.
// NOTE(review): the function signature (source line 496) and numerous `case`
// labels (lines 539, 543, 548, 611, 638, 642-643, 651, 659, 669-675, 687,
// 691, 699) were dropped by extraction — each gap is marked below.
497 unsigned int Opc = N->getOpcode();
498 if (N->isMachineOpcode()) {
499 N->setNodeId(-1);
500 return; // Already selected.
501 }
502
503 // isa<MemSDNode> almost works but is slightly too permissive for some DS
504 // intrinsics.
505 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
506 N = glueCopyToM0LDSInit(N);
507 SelectCode(N);
508 return;
509 }
510
511 switch (Opc) {
512 default:
513 break;
514 // We are selecting i64 ADD here instead of custom lower it during
515 // DAG legalization, so we can fold some i64 ADDs used for address
516 // calculation into the LOAD and STORE instructions.
517 case ISD::ADDC:
518 case ISD::ADDE:
519 case ISD::SUBC:
520 case ISD::SUBE: {
521 if (N->getValueType(0) != MVT::i64)
522 break;
523
524 SelectADD_SUB_I64(N);
525 return;
526 }
527 case ISD::UADDO_CARRY:
528 case ISD::USUBO_CARRY:
529 if (N->getValueType(0) != MVT::i32)
530 break;
531
532 SelectAddcSubb(N);
533 return;
534 case ISD::UADDO:
535 case ISD::USUBO: {
536 SelectUADDO_USUBO(N);
537 return;
538 }
// NOTE(review): case label (source line 539) missing here.
540 SelectFMUL_W_CHAIN(N);
541 return;
542 }
// NOTE(review): case label (source line 543) missing here.
544 SelectFMA_W_CHAIN(N);
545 return;
546 }
547
// NOTE(review): case label (source line 548) missing here.
549 case ISD::BUILD_VECTOR: {
550 EVT VT = N->getValueType(0);
551 unsigned NumVectorElts = VT.getVectorNumElements();
552 if (VT.getScalarSizeInBits() == 16) {
553 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
554 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
555 ReplaceNode(N, Packed);
556 return;
557 }
558 }
559
560 break;
561 }
562
563 assert(VT.getVectorElementType().bitsEq(MVT::i32));
564 unsigned RegClassID =
565 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
566 SelectBuildVector(N, RegClassID);
567 return;
568 }
569 case ISD::BUILD_PAIR: {
570 SDValue RC, SubReg0, SubReg1;
571 SDLoc DL(N);
572 if (N->getValueType(0) == MVT::i128) {
573 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
574 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
575 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
576 } else if (N->getValueType(0) == MVT::i64) {
577 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
578 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
579 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
580 } else {
581 llvm_unreachable("Unhandled value type for BUILD_PAIR");
582 }
583 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
584 N->getOperand(1), SubReg1 };
585 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
586 N->getValueType(0), Ops));
587 return;
588 }
589
590 case ISD::Constant:
591 case ISD::ConstantFP: {
// Only 64-bit, non-inline, non-32-bit-literal constants need splitting.
592 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
593 break;
594
595 uint64_t Imm;
596 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
597 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
598 if (AMDGPU::isValid32BitLiteral(Imm, true))
599 break;
600 } else {
601 ConstantSDNode *C = cast<ConstantSDNode>(N);
602 Imm = C->getZExtValue();
603 if (AMDGPU::isValid32BitLiteral(Imm, false))
604 break;
605 }
606
607 SDLoc DL(N);
608 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
609 return;
610 }
// NOTE(review): case label (source line 611, the BFE_I32 case) missing here.
612 case AMDGPUISD::BFE_U32: {
613 // There is a scalar version available, but unlike the vector version which
614 // has a separate operand for the offset and width, the scalar version packs
615 // the width and offset into a single operand. Try to move to the scalar
616 // version if the offsets are constant, so that we can try to keep extended
617 // loads of kernel arguments in SGPRs.
618
619 // TODO: Technically we could try to pattern match scalar bitshifts of
620 // dynamic values, but it's probably not useful.
621 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
622 if (!Offset)
623 break;
624
625 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
626 if (!Width)
627 break;
628
629 bool Signed = Opc == AMDGPUISD::BFE_I32;
630
631 uint32_t OffsetVal = Offset->getZExtValue();
632 uint32_t WidthVal = Width->getZExtValue();
633
634 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
635 WidthVal));
636 return;
637 }
// NOTE(review): case label (source line 638) missing here.
639 SelectDIV_SCALE(N);
640 return;
641 }
// NOTE(review): case labels (source lines 642-643) missing here.
644 SelectMAD_64_32(N);
645 return;
646 }
647 case ISD::SMUL_LOHI:
648 case ISD::UMUL_LOHI:
649 return SelectMUL_LOHI(N);
650 case ISD::CopyToReg: {
// NOTE(review): the declaration of `Lowering` (source line 651) missing here.
652 *static_cast<const SITargetLowering*>(getTargetLowering());
653 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
654 break;
655 }
656 case ISD::AND:
657 case ISD::SRL:
658 case ISD::SRA:
// NOTE(review): case label (source line 659) missing here.
660 if (N->getValueType(0) != MVT::i32)
661 break;
662
663 SelectS_BFE(N);
664 return;
665 case ISD::BRCOND:
666 SelectBRCOND(N);
667 return;
668 case ISD::FP_EXTEND:
669 SelectFP_EXTEND(N);
670 return;
// NOTE(review): case labels (source lines 671-675) missing here.
676 // Hack around using a legal type if f16 is illegal.
677 if (N->getValueType(0) == MVT::i32) {
678 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
679 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
680 { N->getOperand(0), N->getOperand(1) });
681 SelectCode(N);
682 return;
683 }
684
685 break;
686 }
// NOTE(review): case label (source line 687) missing here.
688 SelectINTRINSIC_W_CHAIN(N);
689 return;
690 }
// NOTE(review): case label (source line 691) missing here.
692 SelectINTRINSIC_WO_CHAIN(N);
693 return;
694 }
695 case ISD::INTRINSIC_VOID: {
696 SelectINTRINSIC_VOID(N);
697 return;
698 }
// NOTE(review): case label (source line 699) missing here.
700 SelectWAVE_ADDRESS(N);
701 return;
702 }
703 case ISD::STACKRESTORE: {
704 SelectSTACKRESTORE(N);
705 return;
706 }
707 }
708
// Everything else goes through the tablegen-generated matcher.
709 SelectCode(N);
710}
711
712bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
713 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
714 const Instruction *Term = BB->getTerminator();
715 return Term->getMetadata("amdgpu.uniform") ||
716 Term->getMetadata("structurizecfg.uniform");
717}
718
719bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
720 unsigned ShAmtBits) const {
721 assert(N->getOpcode() == ISD::AND);
722
723 const APInt &RHS = N->getConstantOperandAPInt(1);
724 if (RHS.countr_one() >= ShAmtBits)
725 return true;
726
727 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
728 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
729}
730
// Match a 64-bit base+offset that was split into a v2i32 build_vector whose
// low lane carries the OR'd constant offset, recovering the original base
// vector in N0 and the offset in N1.
// NOTE(review): the function signature (source line 731) and one conjunct of
// the condition (line 745, checking BaseHi's opcode) are missing from this
// capture.
732 SDValue &N0, SDValue &N1) {
733 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
734 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
735 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
736 // (i64 (bitcast (v2i32 (build_vector
737 // (or (extract_vector_elt V, 0), OFFSET),
738 // (extract_vector_elt V, 1)))))
739 SDValue Lo = Addr.getOperand(0).getOperand(0);
740 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
741 SDValue BaseLo = Lo.getOperand(0);
742 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
743 // Check that split base (Lo and Hi) are extracted from the same one.
744 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
746 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
747 // Lo is statically extracted from index 0.
748 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
749 BaseLo.getConstantOperandVal(1) == 0 &&
750 // Hi is statically extracted from index 1.
751 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
752 BaseHi.getConstantOperandVal(1) == 1) {
753 N0 = BaseLo.getOperand(0).getOperand(0);
754 N1 = Lo.getOperand(1);
755 return true;
756 }
757 }
758 }
759 return false;
760}
761
// Decompose a 64-bit address into base (LHS) and constant offset (RHS),
// either directly or via the split-OR build_vector pattern above.
// NOTE(review): the opening condition (source line 764) of the first branch
// is missing from this capture.
762bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
763 SDValue &RHS) const {
765 LHS = Addr.getOperand(0);
766 RHS = Addr.getOperand(1);
767 return true;
768 }
769
770 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
771 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
772 return true;
773 }
774
775 return false;
776}
777
// getPassName body for the legacy pass.
// NOTE(review): the signature (source line 778) is missing from this capture.
779 return "AMDGPU DAG->DAG Pattern Instruction Selection";
780}
781
// New-PM pass constructor: owns an AMDGPUDAGToDAGISel built at the target
// machine's opt level. (Signature lines 782-783 missing in capture.)
784 std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}
785
// New-PM run(): under EXPENSIVE_CHECKS asserts all loops are in LCSSA form,
// then delegates to SelectionDAGISelPass::run. (Lines 786-788, 790, 793-794
// — the signature and the DT/LI analysis fetches — missing in capture.)
789#ifdef EXPENSIVE_CHECKS
791 .getManager();
792 auto &F = MF.getFunction();
795 for (auto &L : LI.getLoopsInPreorder())
796 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
797#endif
798 return SelectionDAGISelPass::run(MF, MFAM);
799}
800
801//===----------------------------------------------------------------------===//
802// Complex Patterns
803//===----------------------------------------------------------------------===//
804
// Complex-pattern hook for ADDR_VTX reads; unconditionally fails to match
// (never writes Base/Offset).
805bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
806 SDValue &Offset) {
807 return false;
808}
809
// Split an indirect address into a base register and a constant offset:
// plain constants and DWORDADDR-of-constant use the R600 indirect base
// register; add/or with a constant RHS uses its LHS as base; anything else
// becomes base + 0. Always matches.
// NOTE(review): the `ConstantSDNode *C;` declaration (source line 812) is
// missing from this capture.
810bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
811 SDValue &Offset) {
813 SDLoc DL(Addr);
814
815 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
816 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
817 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
818 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
819 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
820 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
821 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
822 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
823 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
824 Base = Addr.getOperand(0);
825 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
826 } else {
827 Base = Addr;
828 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
829 }
830
831 return true;
832}
833
// Materialize a 32-bit immediate into an SGPR via S_MOV_B32 and return the
// resulting value.
// NOTE(review): the declaration of `Mov` / start of the getMachineNode call
// (source line 836) is missing from this capture.
834SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
835 const SDLoc &DL) const {
837 AMDGPU::S_MOV_B32, DL, MVT::i32,
838 CurDAG->getTargetConstant(Val, DL, MVT::i32));
839 return SDValue(Mov, 0);
840}
841
842// FIXME: Should only handle uaddo_carry/usubo_carry
843void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
844 SDLoc DL(N);
845 SDValue LHS = N->getOperand(0);
846 SDValue RHS = N->getOperand(1);
847
848 unsigned Opcode = N->getOpcode();
849 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
850 bool ProduceCarry =
851 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
852 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
853
854 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
855 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
856
857 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
858 DL, MVT::i32, LHS, Sub0);
859 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
860 DL, MVT::i32, LHS, Sub1);
861
862 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
863 DL, MVT::i32, RHS, Sub0);
864 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
865 DL, MVT::i32, RHS, Sub1);
866
867 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
868
869 static const unsigned OpcMap[2][2][2] = {
870 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
871 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
872 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
873 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
874
875 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
876 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
877
878 SDNode *AddLo;
879 if (!ConsumeCarry) {
880 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
881 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
882 } else {
883 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
884 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
885 }
886 SDValue AddHiArgs[] = {
887 SDValue(Hi0, 0),
888 SDValue(Hi1, 0),
889 SDValue(AddLo, 1)
890 };
891 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
892
893 SDValue RegSequenceArgs[] = {
894 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
895 SDValue(AddLo,0),
896 Sub0,
897 SDValue(AddHi,0),
898 Sub1,
899 };
900 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
901 MVT::i64, RegSequenceArgs);
902
903 if (ProduceCarry) {
904 // Replace the carry-use
905 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
906 }
907
908 // Replace the remaining uses.
909 ReplaceNode(N, RegSequence);
910}
911
// Select a 32-bit carry-consuming add/sub: VALU V_ADDC/V_SUBB (with clamp
// bit) for divergent nodes, scalar carry pseudos otherwise.
// NOTE(review): the `CurDAG->SelectNodeTo(` call line (source line 921) in
// the divergent branch is missing from this capture.
912void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
913 SDLoc DL(N);
914 SDValue LHS = N->getOperand(0);
915 SDValue RHS = N->getOperand(1);
916 SDValue CI = N->getOperand(2);
917
918 if (N->isDivergent()) {
919 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
920 : AMDGPU::V_SUBB_U32_e64;
922 N, Opc, N->getVTList(),
923 {LHS, RHS, CI,
924 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
925 } else {
926 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
927 : AMDGPU::S_SUB_CO_PSEUDO;
928 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
929 }
930}
931
// Select uaddo/usubo. Forced onto the VALU when any user of the carry result
// is not the matching carry-consuming op; otherwise scalar pseudos are used.
// NOTE(review): the `CurDAG->SelectNodeTo(` call line (source line 952) in
// the VALU branch is missing from this capture.
932void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
933 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
934 // carry out despite the _i32 name. These were renamed in VI to _U32.
935 // FIXME: We should probably rename the opcodes here.
936 bool IsAdd = N->getOpcode() == ISD::UADDO;
937 bool IsVALU = N->isDivergent();
938
939 for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
940 ++UI)
941 if (UI.getUse().getResNo() == 1) {
942 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
943 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
944 IsVALU = true;
945 break;
946 }
947 }
948
949 if (IsVALU) {
950 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
951
953 N, Opc, N->getVTList(),
954 {N->getOperand(0), N->getOperand(1),
955 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
956 } else {
957 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
958 : AMDGPU::S_USUBO_PSEUDO;
959
960 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
961 {N->getOperand(0), N->getOperand(1)});
962 }
963}
964
965void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
966 SDLoc SL(N);
967 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
968 SDValue Ops[10];
969
970 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
971 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
972 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
973 Ops[8] = N->getOperand(0);
974 Ops[9] = N->getOperand(4);
975
976 // If there are no source modifiers, prefer fmac over fma because it can use
977 // the smaller VOP2 encoding.
978 bool UseFMAC = Subtarget->hasDLInsts() &&
979 cast<ConstantSDNode>(Ops[0])->isZero() &&
980 cast<ConstantSDNode>(Ops[2])->isZero() &&
981 cast<ConstantSDNode>(Ops[4])->isZero();
982 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
983 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
984}
985
986void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
987 SDLoc SL(N);
988 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
989 SDValue Ops[8];
990
991 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
992 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
993 Ops[6] = N->getOperand(0);
994 Ops[7] = N->getOperand(3);
995
996 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
997}
998
999// We need to handle this here because tablegen doesn't support matching
1000// instructions with multiple outputs.
1001void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1002 SDLoc SL(N);
1003 EVT VT = N->getValueType(0);
1004
1005 assert(VT == MVT::f32 || VT == MVT::f64);
1006
1007 unsigned Opc
1008 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1009
1010 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1011 // omod
1012 SDValue Ops[8];
1013 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1014 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1015 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1016 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1017}
1018
1019// We need to handle this here because tablegen doesn't support matching
1020// instructions with multiple outputs.
1021void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1022 SDLoc SL(N);
1023 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1024 unsigned Opc;
1025 if (Subtarget->hasMADIntraFwdBug())
1026 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1027 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1028 else
1029 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1030
1031 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1032 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1033 Clamp };
1034 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1035}
1036
1037// We need to handle this here because tablegen doesn't support matching
1038// instructions with multiple outputs.
// Lowers smul_lohi/umul_lohi to a MAD_64_32 with a zero addend, then splits
// the 64-bit product back into lo/hi subregister extracts for each used
// result.
// NOTE(review): the final cleanup statement (source line 1065, presumably
// removing the now-dead original node) is missing from this capture.
1039void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1040 SDLoc SL(N);
1041 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1042 unsigned Opc;
1043 if (Subtarget->hasMADIntraFwdBug())
1044 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1045 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1046 else
1047 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1048
1049 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1050 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1051 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1052 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1053 if (!SDValue(N, 0).use_empty()) {
1054 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1055 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1056 MVT::i32, SDValue(Mad, 0), Sub0);
1057 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1058 }
1059 if (!SDValue(N, 1).use_empty()) {
1060 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1061 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1062 MVT::i32, SDValue(Mad, 0), Sub1);
1063 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1064 }
1066}
1067
1068bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1069 if (!isUInt<16>(Offset))
1070 return false;
1071
1072 if (!Base || Subtarget->hasUsableDSOffset() ||
1073 Subtarget->unsafeDSOffsetFoldingEnabled())
1074 return true;
1075
1076 // On Southern Islands instruction with a negative base value and an offset
1077 // don't seem to work.
1078 return CurDAG->SignBitIsZero(Base);
1079}
1080
// Select a DS address as (Base, 16-bit byte Offset), folding a constant
// addend into the offset field when isDSOffsetLegal allows it. Always
// succeeds; the fallback uses the whole address with a zero offset.
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    // N1 is a constant addend here — presumably guarded by a
    // base-with-constant-offset check just above (TODO confirm in full file).
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          // Materialize (0 - x) as a real VALU subtract and use it as base.
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            // The no-carry VOP3 form needs an explicit clamp operand.
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}
1152
1153bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1154 unsigned Offset1,
1155 unsigned Size) const {
1156 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1157 return false;
1158 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1159 return false;
1160
1161 if (!Base || Subtarget->hasUsableDSOffset() ||
1162 Subtarget->unsafeDSOffsetFoldingEnabled())
1163 return true;
1164
1165 // On Southern Islands instruction with a negative base value and an offset
1166 // don't seem to work.
1167 return CurDAG->SignBitIsZero(Base);
1168}
1169
// Return whether the operation has NoUnsignedWrap property.
  // An ADD qualifies when it carries the nuw flag; an OR is also accepted —
  // presumably it is only produced here for disjoint operand bits, where it
  // acts as an add that cannot carry (verify against callers).
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}
1176
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise require the base (first operand) to be provably non-negative.
  return CurDAG->SignBitIsZero(LHS);
}
1204
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  // Pre-GFX12 both components must be provably non-negative (unsigned per
  // hardware requirement).
  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}
1220
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  // Addr is expected to be (add (add SGPR, VGPR), Imm); Base is the inner sum.
  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both components of the inner sum must be provably non-negative.
  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}
1244
// TODO: If offset is too big, put low 16-bit into offset.
// Select a ds_read2/write2-style address with two 4-byte-scaled offsets
// (64-bit access split into two dwords).
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}
1251
// Select a ds_read2/write2-style address with two 8-byte-scaled offsets
// (128-bit access split into two qwords).
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
1257
// Shared worker for ds_read2/write2 selection: produce (Base, Offset0,
// Offset1) where the two offsets are consecutive Size-scaled 8-bit values.
// Always succeeds; the fallback uses the whole address with offsets 0 and 1.
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    // N1 is a constant addend here — presumably guarded by a
    // base-with-constant-offset check just above (TODO confirm in full file).
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    // The second element immediately follows the first.
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      // Offsets are encoded scaled by the element size.
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          // Materialize (0 - x) as a real VALU subtract and use it as base.
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            // The no-carry VOP3 form needs an explicit clamp operand.
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // Constant address: share a zero base register and put the constant into
    // the offsets (same rationale as SelectDS1Addr1Offset).
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
1337
// Decompose Addr into the full set of MUBUF operands: resource pointer (Ptr),
// VGPR address (VAddr), scalar offset (SOffset), immediate offset (Offset)
// and the offen/idxen/addr64 mode flags. Returns false only when the
// subtarget prefers flat instructions for global access.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  // Start from the "no dynamic addressing" configuration; the cases below
  // overwrite these flags as needed.
  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  // Subtargets with a restricted soffset must encode "no soffset" as the
  // null SGPR rather than a literal zero.
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
    // Peel off a constant addend that fits in 32 bits; otherwise keep the
    // whole address in N0 and forget the constant.
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    // The uniform operand goes into the resource pointer, the divergent one
    // into the VGPR address.
    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
          AMDGPU::S_MOV_B32, DL, MVT::i32,
          CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
      0);
  return true;
}
1425
1426bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1427 SDValue &VAddr, SDValue &SOffset,
1428 SDValue &Offset) const {
1429 SDValue Ptr, Offen, Idxen, Addr64;
1430
1431 // addr64 bit was removed for volcanic islands.
1432 // FIXME: This should be a pattern predicate and not reach here
1433 if (!Subtarget->hasAddr64())
1434 return false;
1435
1436 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1437 return false;
1438
1439 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1440 if (C->getSExtValue()) {
1441 SDLoc DL(Addr);
1442
1443 const SITargetLowering& Lowering =
1444 *static_cast<const SITargetLowering*>(getTargetLowering());
1445
1446 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1447 return true;
1448 }
1449
1450 return false;
1451}
1452
1453std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1454 SDLoc DL(N);
1455
1456 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1457 SDValue TFI =
1458 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1459
1460 // We rebase the base address into an absolute stack address and hence
1461 // use constant 0 for soffset. This value must be retained until
1462 // frame elimination and eliminateFrameIndex will choose the appropriate
1463 // frame register if need be.
1464 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1465}
1466
// Select a private (scratch) MUBUF access in offen form: scratch resource,
// VGPR address, scalar offset and immediate offset. Always succeeds (falls
// back to the raw address with a zero immediate offset).
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // Constant address: split it into high bits (materialized into a VGPR)
    // and the part that fits the immediate offset field.
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
1533
1534static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1535 if (Val.getOpcode() != ISD::CopyFromReg)
1536 return false;
1537 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1538 if (!Reg.isPhysical())
1539 return false;
1540 auto RC = TRI.getPhysRegBaseClass(Reg);
1541 return RC && TRI.isSGPRClass(RC);
1542}
1543
// Select a private (scratch) MUBUF access in offset-only form: the address
// must be an SGPR copy, an SGPR copy plus a legal constant, or a legal
// constant alone.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  // Both accepted paths leave CAddr pointing at the constant to encode.
  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}
1587
1588bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1589 SDValue &SOffset, SDValue &Offset
1590 ) const {
1591 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1592 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1593
1594 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1595 return false;
1596
1597 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1598 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1599 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1600 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1601 APInt::getAllOnes(32).getZExtValue(); // Size
1602 SDLoc DL(Addr);
1603
1604 const SITargetLowering& Lowering =
1605 *static_cast<const SITargetLowering*>(getTargetLowering());
1606
1607 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1608 return true;
1609 }
1610 return false;
1611}
1612
1613bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1614 SDValue &SOffset) const {
1615 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1616 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1617 return true;
1618 }
1619
1620 SOffset = ByteOffsetNode;
1621 return true;
1622}
1623
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
  // Direct hit: the root itself is the memory node.
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  // Otherwise the root must be a build_vector whose operands (possibly
  // behind bitcasts) include the memory node.
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}
1637
// Common worker for selecting a flat/global/scratch address as
// (VAddr, immediate Offset). Folds a constant addend into the offset field
// when legal; if the constant is too large, it splits it and materializes the
// remainder with VALU adds. Always succeeds.
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  // Truly-FLAT accesses on buggy subtargets must not use the offset field.
  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        // Whole constant fits the instruction's offset field.
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          // 32-bit address: a single VALU add of the remainder.
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          // 64-bit address: add lo/hi halves with an explicit carry chain and
          // reassemble the result with REG_SEQUENCE.
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}
1732
// Select a FLAT (segment-agnostic) address as vaddr + immediate offset.
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}
1738
// Select a global (FLAT-global variant) address as vaddr + immediate offset.
bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}
1744
// Select a scratch (FLAT-scratch variant) address as vaddr + immediate offset.
bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
}
1751
// If this matches zero_extend i32:x, return x
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // Only a zext from exactly i32 qualifies; anything else returns null.
  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}
1760
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
      // Constant fits the instruction's offset field: strip it off.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(

        if (isUInt<32>(RemainderOffset)) {
          // The remainder becomes the VGPR offset via a V_MOV.
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
          !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  // A divergent, undef or constant address cannot serve as the SGPR base.
  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}
1856
  // Rewrite a scalar address rooted at a frame index into selectable form:
  // either a bare TargetFrameIndex, or TFI + reg folded through S_ADD_I32.
  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  return SAddr;
}
1874
// Match (32-bit SGPR base) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // A divergent address cannot live in an SGPR.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  // Peel off a constant addend when the base is legal for flat scratch.
  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  // Fold a frame index (possibly plus a register) into selectable form.
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  // If the constant does not fit the offset field, keep only the legal part
  // and fold the remainder into the base with a scalar add.
  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(

    COffsetVal = SplitImmOffset;

    SDValue AddOffset =
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);

  return true;
}
1918
// Check whether the flat scratch SVS swizzle bug affects this access.
bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
    SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
  // Only relevant on subtargets that actually have the hardware bug.
  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
    return false;

  // The bug affects the swizzling of SVS accesses if there is any carry out
  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
  // voffset to (soffset + inst_offset).
  KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
      /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
      CurDAG->computeKnownBits(SAddr),
      KnownBits::makeConstant(APInt(32, ImmOffset)));
  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
  // Conservative: report the bug if the maxima of the low two bits can carry.
  return (VMax & 3) + (SMax & 3) >= 4;
}
1937
// Match a flat scratch SVS address: (SGPR base) + (VGPR offset) + imm offset.
// Fails when operand uniformity, base legality or the SVS swizzle bug rule
// out this addressing mode.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset) const  {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  // Remember the pre-stripped address: which legality check applies below
  // depends on whether a constant offset was peeled off.
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
      // Constant fits the instruction's offset field: strip it off.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(SplitImmOffset, RemainderOffset)
        = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);

      if (isUInt<32>(RemainderOffset)) {
        // The remainder becomes the VGPR address via a V_MOV.
        SDNode *VMov = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
          CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
        return true;
      }
    }
  }

  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  // Exactly one operand must be uniform (SGPR base), the other divergent
  // (VGPR offset).
  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // If a constant was stripped, validate SGPR+VGPR+Imm; otherwise SGPR+VGPR.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
  return true;
}
2006
2007// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2008// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2009// Handle the case where the Immediate Offset + SOffset is negative.
2010bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2011 bool Imm32Only,
2012 bool IsBuffer,
2013 int64_t ImmOffset) const {
2014 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2015 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2016 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2017 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2018 return false;
2019 }
2020
2021 return true;
2022}
2023
2024// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2025// not null) offset. If Imm32Only is true, match only 32-bit immediate
2026// offsets available on CI.
2027bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2028 SDValue *SOffset, SDValue *Offset,
2029 bool Imm32Only, bool IsBuffer,
2030 bool HasSOffset,
2031 int64_t ImmOffset) const {
2032 assert((!SOffset || !Offset) &&
2033 "Cannot match both soffset and offset at the same time!");
2034
2035 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2036 if (!C) {
2037 if (!SOffset)
2038 return false;
2039
2040 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2041 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2042 *SOffset = ByteOffsetNode;
2043 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2044 ImmOffset);
2045 }
2046 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2047 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2048 *SOffset = ByteOffsetNode.getOperand(0);
2049 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2050 ImmOffset);
2051 }
2052 }
2053 return false;
2054 }
2055
2056 SDLoc SL(ByteOffsetNode);
2057
2058 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2059 // offset for S_BUFFER instructions is unsigned.
2060 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2061 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2062 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2063 if (EncodedOffset && Offset && !Imm32Only) {
2064 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2065 return true;
2066 }
2067
2068 // SGPR and literal offsets are unsigned.
2069 if (ByteOffset < 0)
2070 return false;
2071
2072 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2073 if (EncodedOffset && Offset && Imm32Only) {
2074 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2075 return true;
2076 }
2077
2078 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2079 return false;
2080
2081 if (SOffset) {
2082 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2083 *SOffset = SDValue(
2084 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2085 return true;
2086 }
2087
2088 return false;
2089}
2090
// Widen a 32-bit SMEM address to the 64 bits the instruction expects by
// pairing it with the per-function high address bits in a REG_SEQUENCE.
// Non-i32 addresses pass through unchanged.
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  // NOTE(review): `Info` is presumably the SIMachineFunctionInfo of the
  // current function -- its declaration is not visible in this excerpt.
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  // Build sub0 = Addr, sub1 = S_MOV_B32 of the high bits.
  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}
2115
2116// Match a base and an immediate (if Offset is not null) or an SGPR (if
2117// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2118// true, match only 32-bit immediate offsets available on CI.
2119bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2120 SDValue *SOffset, SDValue *Offset,
2121 bool Imm32Only, bool IsBuffer,
2122 bool HasSOffset,
2123 int64_t ImmOffset) const {
2124 if (SOffset && Offset) {
2125 assert(!Imm32Only && !IsBuffer);
2126 SDValue B;
2127
2128 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2129 return false;
2130
2131 int64_t ImmOff = 0;
2132 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2133 ImmOff = C->getSExtValue();
2134
2135 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2136 ImmOff);
2137 }
2138
2139 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2140 // wraparound, because s_load instructions perform the addition in 64 bits.
2141 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2142 !Addr->getFlags().hasNoUnsignedWrap())
2143 return false;
2144
2145 SDValue N0, N1;
2146 // Extract the base and offset if possible.
2147 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2148 N0 = Addr.getOperand(0);
2149 N1 = Addr.getOperand(1);
2150 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2151 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2152 }
2153 if (!N0 || !N1)
2154 return false;
2155
2156 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2157 ImmOffset)) {
2158 SBase = N0;
2159 return true;
2160 }
2161 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2162 ImmOffset)) {
2163 SBase = N1;
2164 return true;
2165 }
2166 return false;
2167}
2168
2169bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2170 SDValue *SOffset, SDValue *Offset,
2171 bool Imm32Only) const {
2172 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2173 SBase = Expand32BitAddress(SBase);
2174 return true;
2175 }
2176
2177 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2178 SBase = Expand32BitAddress(Addr);
2179 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2180 return true;
2181 }
2182
2183 return false;
2184}
2185
2186bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2187 SDValue &Offset) const {
2188 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2189}
2190
// Match base + 32-bit literal immediate offset (the CI-only SMRD encoding).
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
                    /* Imm32Only */ true);
}
2197
2198bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2199 SDValue &SOffset) const {
2200 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2201}
2202
2203bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2204 SDValue &SOffset,
2205 SDValue &Offset) const {
2206 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2207}
2208
2209bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2210 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2211 /* Imm32Only */ false, /* IsBuffer */ true);
2212}
2213
// S_BUFFER form with a 32-bit literal immediate offset (CI encoding).
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}
2220
2221bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2222 SDValue &Offset) const {
2223 // Match the (soffset + offset) pair as a 32-bit register base and
2224 // an immediate offset.
2225 return N.getValueType() == MVT::i32 &&
2226 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2227 &Offset, /* Imm32Only */ false,
2228 /* IsBuffer */ true);
2229}
2230
// Split Index into a base register and a constant offset for MOVREL-style
// indexing; falls back to (Index, 0) when nothing can be peeled off.
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  // NOTE(review): the guard opening this scope (presumably a
  // CurDAG->isBaseWithConstantOffset(Index) check) is missing from this
  // excerpt.
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  // A bare constant index cannot be used as a base register.
  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
2260
// Emit a 32-bit bitfield extract of Width bits starting at bit Offset of Val,
// using the VALU (V_BFE_*) form for divergent values and the SALU (S_BFE_*)
// form otherwise.
SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  if (Val->isDivergent()) {
    unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    // NOTE(review): the declaration of `Off` (presumably a target constant
    // holding Offset) is missing from this excerpt.
    SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);

    return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
  }
  unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}
2280
2281void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2282 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2283 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2284 // Predicate: 0 < b <= c < 32
2285
2286 const SDValue &Shl = N->getOperand(0);
2287 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2288 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2289
2290 if (B && C) {
2291 uint32_t BVal = B->getZExtValue();
2292 uint32_t CVal = C->getZExtValue();
2293
2294 if (0 < BVal && BVal <= CVal && CVal < 32) {
2295 bool Signed = N->getOpcode() == ISD::SRA;
2296 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2297 32 - CVal));
2298 return;
2299 }
2300 }
2301 SelectCode(N);
2302}
2303
// Recognize and/srl shift-and-mask combinations that can be selected as a
// single 32-bit bitfield extract; falls back to tablegen selection otherwise.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          // Width is the number of set bits in the contiguous mask.
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  // NOTE(review): the case label opening this scope (presumably
  // ISD::SIGN_EXTEND_INREG) is missing from this excerpt.
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}
2377
2378bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2379 assert(N->getOpcode() == ISD::BRCOND);
2380 if (!N->hasOneUse())
2381 return false;
2382
2383 SDValue Cond = N->getOperand(1);
2384 if (Cond.getOpcode() == ISD::CopyToReg)
2385 Cond = Cond.getOperand(2);
2386
2387 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2388 return false;
2389
2390 MVT VT = Cond.getOperand(0).getSimpleValueType();
2391 if (VT == MVT::i32)
2392 return true;
2393
2394 if (VT == MVT::i64) {
2395 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2396
2397 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2398 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2399 }
2400
2401 return false;
2402}
2403
2404static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2405 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2406 // Special case for amdgcn.ballot:
2407 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2408 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2409 // =>
2410 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2411 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2412 // Cond becomes a i(WaveSize) full mask value.
2413 // Note that ballot doesn't use SETEQ condition but its easy to support it
2414 // here for completeness, so in this case Negate is set true on return.
2415 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2416 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2417 isNullConstant(VCMP.getOperand(1))) {
2418
2419 auto Cond = VCMP.getOperand(0);
2420 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2421 Cond = Cond.getOperand(0);
2422
2423 if (isBoolSGPR(Cond)) {
2424 Negate = VCMP_CC == ISD::SETEQ;
2425 return Cond;
2426 }
2427 }
2428 return SDValue();
2429}
2430
// Select BRCOND into S_CBRANCH_SCC0/1 (uniform, SCC-based) or
// S_CBRANCH_VCCZ/NZ (VCC-based), recognizing the amdgcn.ballot compare
// pattern so the wave-mask compare can feed VCC directly.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  // An undef condition selects to a dedicated pseudo.
  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
                                                         : AMDGPU::S_AND_B64,
                     SL, MVT::i1,
                     CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                                        : AMDGPU::EXEC,
                                         MVT::i1),
                    Cond),
                   0);
  }

  // Copy the condition into SCC/VCC and emit the conditional branch.
  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2516
2517void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2518 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2519 !N->isDivergent()) {
2520 SDValue Src = N->getOperand(0);
2521 if (Src.getValueType() == MVT::f16) {
2522 if (isExtractHiElt(Src, Src)) {
2523 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2524 {Src});
2525 return;
2526 }
2527 }
2528 }
2529
2530 SelectCode(N);
2531}
2532
// Select ds_append/ds_consume: the pointer goes into m0 and a legal constant
// offset is folded into the instruction's offset field.
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  // NOTE(review): the declaration of `Offset` and the guard opening this
  // scope (presumably CurDAG->isBaseWithConstantOffset(Ptr)) are missing
  // from this excerpt.
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    // No foldable offset: the full pointer goes through m0.
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
2572
2573// We need to handle this here because tablegen doesn't support matching
2574// instructions with multiple outputs.
2575void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2576 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2577 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2578 N->getOperand(5), N->getOperand(0)};
2579
2580 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2581 MachineMemOperand *MMO = M->getMemOperand();
2582 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2583 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2584}
2585
2586static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2587 switch (IntrID) {
2588 case Intrinsic::amdgcn_ds_gws_init:
2589 return AMDGPU::DS_GWS_INIT;
2590 case Intrinsic::amdgcn_ds_gws_barrier:
2591 return AMDGPU::DS_GWS_BARRIER;
2592 case Intrinsic::amdgcn_ds_gws_sema_v:
2593 return AMDGPU::DS_GWS_SEMA_V;
2594 case Intrinsic::amdgcn_ds_gws_sema_br:
2595 return AMDGPU::DS_GWS_SEMA_BR;
2596 case Intrinsic::amdgcn_ds_gws_sema_p:
2597 return AMDGPU::DS_GWS_SEMA_P;
2598 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2599 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2600 default:
2601 llvm_unreachable("not a gws intrinsic");
2602 }
2603}
2604
// Select a ds_gws_* intrinsic: set up m0 with the shifted resource base,
// fold any constant part into the offset field, and emit the GWS machine
// instruction with its memory operand attached.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      // Peel a constant component off the variable base.
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  // NOTE(review): the declaration of `Ops` (presumably a SmallVector of
  // operands) is missing from this excerpt.
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
2670
// Select llvm.amdgcn.interp.p1.f16 on 16-bank-LDS subtargets, where it must
// be expanded into V_INTERP_MOV_F32 + V_INTERP_P1LV_F16 sharing one m0 copy.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
2728
2729void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2730 unsigned IntrID = N->getConstantOperandVal(1);
2731 switch (IntrID) {
2732 case Intrinsic::amdgcn_ds_append:
2733 case Intrinsic::amdgcn_ds_consume: {
2734 if (N->getValueType(0) != MVT::i32)
2735 break;
2736 SelectDSAppendConsume(N, IntrID);
2737 return;
2738 }
2739 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2740 SelectDSBvhStackIntrinsic(N);
2741 return;
2742 }
2743
2744 SelectCode(N);
2745}
2746
2747void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2748 unsigned IntrID = N->getConstantOperandVal(0);
2749 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2750 SDNode *ConvGlueNode = N->getGluedNode();
2751 if (ConvGlueNode) {
2752 // FIXME: Possibly iterate over multiple glue nodes?
2753 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2754 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2755 ConvGlueNode =
2756 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2757 MVT::Glue, SDValue(ConvGlueNode, 0));
2758 } else {
2759 ConvGlueNode = nullptr;
2760 }
2761 switch (IntrID) {
2762 case Intrinsic::amdgcn_wqm:
2763 Opcode = AMDGPU::WQM;
2764 break;
2765 case Intrinsic::amdgcn_softwqm:
2766 Opcode = AMDGPU::SOFT_WQM;
2767 break;
2768 case Intrinsic::amdgcn_wwm:
2769 case Intrinsic::amdgcn_strict_wwm:
2770 Opcode = AMDGPU::STRICT_WWM;
2771 break;
2772 case Intrinsic::amdgcn_strict_wqm:
2773 Opcode = AMDGPU::STRICT_WQM;
2774 break;
2775 case Intrinsic::amdgcn_interp_p1_f16:
2776 SelectInterpP1F16(N);
2777 return;
2778 default:
2779 SelectCode(N);
2780 break;
2781 }
2782
2783 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2784 SDValue Src = N->getOperand(1);
2785 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2786 }
2787
2788 if (ConvGlueNode) {
2789 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2790 NewOps.push_back(SDValue(ConvGlueNode, 0));
2791 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2792 }
2793}
2794
2795void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2796 unsigned IntrID = N->getConstantOperandVal(1);
2797 switch (IntrID) {
2798 case Intrinsic::amdgcn_ds_gws_init:
2799 case Intrinsic::amdgcn_ds_gws_barrier:
2800 case Intrinsic::amdgcn_ds_gws_sema_v:
2801 case Intrinsic::amdgcn_ds_gws_sema_br:
2802 case Intrinsic::amdgcn_ds_gws_sema_p:
2803 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2804 SelectDS_GWS(N, IntrID);
2805 return;
2806 default:
2807 break;
2808 }
2809
2810 SelectCode(N);
2811}
2812
2813void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2814 SDValue Log2WaveSize =
2815 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2816 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2817 {N->getOperand(0), Log2WaveSize});
2818}
2819
// Select STACKRESTORE: convert the saved i32 value back to the stack
// pointer's representation (shifted left by log2(wave size) unless it came
// straight from a WAVE_ADDRESS) and copy it into the stack pointer register.
void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  // NOTE(review): the declaration of `SP` (presumably the stack pointer
  // register) is missing from this excerpt.
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    // Undo the WAVE_ADDRESS conversion by reusing its input directly.
    CopyVal = SrcVal.getOperand(0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
      Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);

    if (N->isDivergent()) {
      // The stack pointer is scalar; force the value into an SGPR.
      SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
                                              MVT::i32, SrcVal),
                       0);
    }

    CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                                             {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
}
2851
2852bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2853 unsigned &Mods,
2854 bool IsCanonicalizing,
2855 bool AllowAbs) const {
2856 Mods = SISrcMods::NONE;
2857 Src = In;
2858
2859 if (Src.getOpcode() == ISD::FNEG) {
2860 Mods |= SISrcMods::NEG;
2861 Src = Src.getOperand(0);
2862 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2863 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2864 // denormal mode, but we're implicitly canonicalizing in a source operand.
2865 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2866 if (LHS && LHS->isZero()) {
2867 Mods |= SISrcMods::NEG;
2868 Src = Src.getOperand(1);
2869 }
2870 }
2871
2872 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2873 Mods |= SISrcMods::ABS;
2874 Src = Src.getOperand(0);
2875 }
2876
2877 return true;
2878}
2879
2880bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2881 SDValue &SrcMods) const {
2882 unsigned Mods;
2883 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2884 /*AllowAbs=*/true)) {
2885 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2886 return true;
2887 }
2888
2889 return false;
2890}
2891
2892bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2893 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2894 unsigned Mods;
2895 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2896 /*AllowAbs=*/true)) {
2897 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2898 return true;
2899 }
2900
2901 return false;
2902}
2903
2904bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2905 SDValue &SrcMods) const {
2906 unsigned Mods;
2907 if (SelectVOP3ModsImpl(In, Src, Mods,
2908 /*IsCanonicalizing=*/true,
2909 /*AllowAbs=*/false)) {
2910 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2911 return true;
2912 }
2913
2914 return false;
2915}
2916
2917bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2918 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2919 return false;
2920
2921 Src = In;
2922 return true;
2923}
2924
2925bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2926 SDValue &SrcMods,
2927 bool OpSel) const {
2928 unsigned Mods;
2929 if (SelectVOP3ModsImpl(In, Src, Mods,
2930 /*IsCanonicalizing=*/true,
2931 /*AllowAbs=*/false)) {
2932 if (OpSel)
2933 Mods |= SISrcMods::OP_SEL_0;
2934 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2935 return true;
2936 }
2937
2938 return false;
2939}
2940
2941bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2942 SDValue &SrcMods) const {
2943 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2944}
2945
2946bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2947 SDValue &SrcMods) const {
2948 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2949}
2950
2951bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2952 SDValue &SrcMods, SDValue &Clamp,
2953 SDValue &Omod) const {
2954 SDLoc DL(In);
2955 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2956 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2957
2958 return SelectVOP3Mods(In, Src, SrcMods);
2959}
2960
2961bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2962 SDValue &SrcMods, SDValue &Clamp,
2963 SDValue &Omod) const {
2964 SDLoc DL(In);
2965 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2966 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2967
2968 return SelectVOP3BMods(In, Src, SrcMods);
2969}
2970
2971bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2972 SDValue &Clamp, SDValue &Omod) const {
2973 Src = In;
2974
2975 SDLoc DL(In);
2976 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2977 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2978
2979 return true;
2980}
2981
2982bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2983 SDValue &SrcMods, bool IsDOT) const {
2984 unsigned Mods = SISrcMods::NONE;
2985 Src = In;
2986
2987 // TODO: Handle G_FSUB 0 as fneg
2988 if (Src.getOpcode() == ISD::FNEG) {
2990 Src = Src.getOperand(0);
2991 }
2992
2993 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2994 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2995 unsigned VecMods = Mods;
2996
2997 SDValue Lo = stripBitcast(Src.getOperand(0));
2998 SDValue Hi = stripBitcast(Src.getOperand(1));
2999
3000 if (Lo.getOpcode() == ISD::FNEG) {
3001 Lo = stripBitcast(Lo.getOperand(0));
3002 Mods ^= SISrcMods::NEG;
3003 }
3004
3005 if (Hi.getOpcode() == ISD::FNEG) {
3006 Hi = stripBitcast(Hi.getOperand(0));
3007 Mods ^= SISrcMods::NEG_HI;
3008 }
3009
3010 if (isExtractHiElt(Lo, Lo))
3011 Mods |= SISrcMods::OP_SEL_0;
3012
3013 if (isExtractHiElt(Hi, Hi))
3014 Mods |= SISrcMods::OP_SEL_1;
3015
3016 unsigned VecSize = Src.getValueSizeInBits();
3017 Lo = stripExtractLoElt(Lo);
3018 Hi = stripExtractLoElt(Hi);
3019
3020 if (Lo.getValueSizeInBits() > VecSize) {
3022 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3023 MVT::getIntegerVT(VecSize), Lo);
3024 }
3025
3026 if (Hi.getValueSizeInBits() > VecSize) {
3028 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3029 MVT::getIntegerVT(VecSize), Hi);
3030 }
3031
3032 assert(Lo.getValueSizeInBits() <= VecSize &&
3033 Hi.getValueSizeInBits() <= VecSize);
3034
3035 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3036 // Really a scalar input. Just select from the low half of the register to
3037 // avoid packing.
3038
3039 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3040 Src = Lo;
3041 } else {
3042 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3043
3044 SDLoc SL(In);
3046 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3047 Lo.getValueType()), 0);
3048 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3049 : AMDGPU::SReg_64RegClassID;
3050 const SDValue Ops[] = {
3051 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3052 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3053 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3054
3055 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3056 Src.getValueType(), Ops), 0);
3057 }
3058 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3059 return true;
3060 }
3061
3062 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3063 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3064 .bitcastToAPInt().getZExtValue();
3065 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3066 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3067 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3068 return true;
3069 }
3070 }
3071
3072 Mods = VecMods;
3073 }
3074
3075 // Packed instructions do not have abs modifiers.
3076 Mods |= SISrcMods::OP_SEL_1;
3077
3078 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3079 return true;
3080}
3081
3082bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3083 SDValue &SrcMods) const {
3084 return SelectVOP3PMods(In, Src, SrcMods, true);
3085}
3086
3087bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3088 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3089 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3090 // 1 promotes packed values to signed, 0 treats them as unsigned.
3091 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3092
3093 unsigned Mods = SISrcMods::OP_SEL_1;
3094 unsigned SrcSign = C->getZExtValue();
3095 if (SrcSign == 1)
3096 Mods ^= SISrcMods::NEG;
3097
3098 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3099 return true;
3100}
3101
3102bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3103 SDValue &Src) const {
3104 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3105 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3106
3107 unsigned Mods = SISrcMods::OP_SEL_1;
3108 unsigned SrcVal = C->getZExtValue();
3109 if (SrcVal == 1)
3110 Mods |= SISrcMods::OP_SEL_0;
3111
3112 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3113 return true;
3114}
3115
3117 llvm::SelectionDAG *CurDAG,
3118 const SDLoc &DL) {
3119 unsigned DstRegClass;
3120 EVT DstTy;
3121 switch (Elts.size()) {
3122 case 8:
3123 DstRegClass = AMDGPU::VReg_256RegClassID;
3124 DstTy = MVT::v8i32;
3125 break;
3126 case 4:
3127 DstRegClass = AMDGPU::VReg_128RegClassID;
3128 DstTy = MVT::v4i32;
3129 break;
3130 case 2:
3131 DstRegClass = AMDGPU::VReg_64RegClassID;
3132 DstTy = MVT::v2i32;
3133 break;
3134 default:
3135 llvm_unreachable("unhandled Reg sequence size");
3136 }
3137
3139 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3140 for (unsigned i = 0; i < Elts.size(); ++i) {
3141 Ops.push_back(Elts[i]);
3142 Ops.push_back(CurDAG->getTargetConstant(
3144 }
3145 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3146}
3147
3149 llvm::SelectionDAG *CurDAG,
3150 const SDLoc &DL) {
3151 SmallVector<SDValue, 8> PackedElts;
3152 assert("unhandled Reg sequence size" &&
3153 (Elts.size() == 8 || Elts.size() == 16));
3154
3155 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3156 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3157 for (unsigned i = 0; i < Elts.size(); i += 2) {
3158 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3159 SDValue HiSrc;
3160 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3161 PackedElts.push_back(HiSrc);
3162 } else {
3163 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3164 MachineSDNode *Packed =
3165 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3166 {Elts[i + 1], Elts[i], PackLoLo});
3167 PackedElts.push_back(SDValue(Packed, 0));
3168 }
3169 }
3170
3171 return buildRegSequence32(PackedElts, CurDAG, DL);
3172}
3173
3175 llvm::SelectionDAG *CurDAG,
3176 const SDLoc &DL, unsigned ElementSize) {
3177 if (ElementSize == 16)
3178 return buildRegSequence16(Elts, CurDAG, DL);
3179 if (ElementSize == 32)
3180 return buildRegSequence32(Elts, CurDAG, DL);
3181 llvm_unreachable("Unhandled element size");
3182}
3183
3184static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3186 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3187 unsigned ElementSize) {
3188 if (ModOpcode == ISD::FNEG) {
3189 Mods |= SISrcMods::NEG;
3190 // Check if all elements also have abs modifier
3191 SmallVector<SDValue, 8> NegAbsElts;
3192 for (auto El : Elts) {
3193 if (El.getOpcode() != ISD::FABS)
3194 break;
3195 NegAbsElts.push_back(El->getOperand(0));
3196 }
3197 if (Elts.size() != NegAbsElts.size()) {
3198 // Neg
3199 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3200 } else {
3201 // Neg and Abs
3202 Mods |= SISrcMods::NEG_HI;
3203 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3204 }
3205 } else {
3206 assert(ModOpcode == ISD::FABS);
3207 // Abs
3208 Mods |= SISrcMods::NEG_HI;
3209 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3210 }
3211}
3212
3213// Check all f16 elements for modifiers while looking through b32 and v2b16
3214// build vector, stop if element does not satisfy ModifierCheck.
3215static void
3217 std::function<bool(SDValue)> ModifierCheck) {
3218 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3219 if (auto *F16Pair =
3220 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3221 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3222 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3223 if (!ModifierCheck(ElF16))
3224 break;
3225 }
3226 }
3227 }
3228}
3229
3230bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3231 SDValue &SrcMods) const {
3232 Src = In;
3233 unsigned Mods = SISrcMods::OP_SEL_1;
3234
3235 // mods are on f16 elements
3236 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3238
3239 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3240 if (Element.getOpcode() != ISD::FNEG)
3241 return false;
3242 EltsF16.push_back(Element.getOperand(0));
3243 return true;
3244 });
3245
3246 // All elements have neg modifier
3247 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3248 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3249 Mods |= SISrcMods::NEG;
3250 Mods |= SISrcMods::NEG_HI;
3251 }
3252 }
3253
3254 // mods are on v2f16 elements
3255 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3256 SmallVector<SDValue, 8> EltsV2F16;
3257 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3258 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3259 // Based on first element decide which mod we match, neg or abs
3260 if (ElV2f16.getOpcode() != ISD::FNEG)
3261 break;
3262 EltsV2F16.push_back(ElV2f16.getOperand(0));
3263 }
3264
3265 // All pairs of elements have neg modifier
3266 if (BV->getNumOperands() == EltsV2F16.size()) {
3267 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3268 Mods |= SISrcMods::NEG;
3269 Mods |= SISrcMods::NEG_HI;
3270 }
3271 }
3272
3273 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3274 return true;
3275}
3276
3277bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3278 SDValue &SrcMods) const {
3279 Src = In;
3280 unsigned Mods = SISrcMods::OP_SEL_1;
3281 unsigned ModOpcode;
3282
3283 // mods are on f16 elements
3284 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3286 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3287 // Based on first element decide which mod we match, neg or abs
3288 if (EltsF16.empty())
3289 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3290 if (ElF16.getOpcode() != ModOpcode)
3291 return false;
3292 EltsF16.push_back(ElF16.getOperand(0));
3293 return true;
3294 });
3295
3296 // All elements have ModOpcode modifier
3297 if (BV->getNumOperands() * 2 == EltsF16.size())
3298 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3299 16);
3300 }
3301
3302 // mods are on v2f16 elements
3303 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3304 SmallVector<SDValue, 8> EltsV2F16;
3305
3306 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3307 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3308 // Based on first element decide which mod we match, neg or abs
3309 if (EltsV2F16.empty())
3310 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3311 if (ElV2f16->getOpcode() != ModOpcode)
3312 break;
3313 EltsV2F16.push_back(ElV2f16->getOperand(0));
3314 }
3315
3316 // All elements have ModOpcode modifier
3317 if (BV->getNumOperands() == EltsV2F16.size())
3318 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3319 32);
3320 }
3321
3322 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3323 return true;
3324}
3325
3326bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3327 SDValue &SrcMods) const {
3328 Src = In;
3329 unsigned Mods = SISrcMods::OP_SEL_1;
3331
3332 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3333 assert(BV->getNumOperands() > 0);
3334 // Based on first element decide which mod we match, neg or abs
3335 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3336 unsigned ModOpcode =
3337 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3338 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3339 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3340 if (ElF32.getOpcode() != ModOpcode)
3341 break;
3342 EltsF32.push_back(ElF32.getOperand(0));
3343 }
3344
3345 // All elements had ModOpcode modifier
3346 if (BV->getNumOperands() == EltsF32.size())
3347 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3348 32);
3349 }
3350
3351 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3352 return true;
3353}
3354
3355bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3356 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3357 BitVector UndefElements;
3358 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3359 if (isInlineImmediate(Splat.getNode())) {
3360 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3361 unsigned Imm = C->getAPIntValue().getSExtValue();
3362 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3363 return true;
3364 }
3365 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3366 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3367 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3368 return true;
3369 }
3370 llvm_unreachable("unhandled Constant node");
3371 }
3372 }
3373
3374 // 16 bit splat
3375 SDValue SplatSrc32 = stripBitcast(In);
3376 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3377 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3378 SDValue SplatSrc16 = stripBitcast(Splat32);
3379 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3380 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3381 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3382 std::optional<APInt> RawValue;
3383 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3384 RawValue = C->getValueAPF().bitcastToAPInt();
3385 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3386 RawValue = C->getAPIntValue();
3387
3388 if (RawValue.has_value()) {
3389 EVT VT = In.getValueType().getScalarType();
3390 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3391 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3394 RawValue.value());
3395 if (TII->isInlineConstant(FloatVal)) {
3396 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3397 MVT::i16);
3398 return true;
3399 }
3400 } else if (VT.getSimpleVT() == MVT::i16) {
3401 if (TII->isInlineConstant(RawValue.value())) {
3402 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3403 MVT::i16);
3404 return true;
3405 }
3406 } else
3407 llvm_unreachable("unknown 16-bit type");
3408 }
3409 }
3410 }
3411
3412 return false;
3413}
3414
3415bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3416 SDValue &IndexKey) const {
3417 unsigned Key = 0;
3418 Src = In;
3419
3420 if (In.getOpcode() == ISD::SRL) {
3421 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3422 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3423 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3424 ShiftAmt->getZExtValue() % 8 == 0) {
3425 Key = ShiftAmt->getZExtValue() / 8;
3426 Src = ShiftSrc;
3427 }
3428 }
3429
3430 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3431 return true;
3432}
3433
3434bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3435 SDValue &IndexKey) const {
3436 unsigned Key = 0;
3437 Src = In;
3438
3439 if (In.getOpcode() == ISD::SRL) {
3440 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3441 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3442 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3443 ShiftAmt->getZExtValue() == 16) {
3444 Key = 1;
3445 Src = ShiftSrc;
3446 }
3447 }
3448
3449 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3450 return true;
3451}
3452
3453bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3454 SDValue &SrcMods) const {
3455 Src = In;
3456 // FIXME: Handle op_sel
3457 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3458 return true;
3459}
3460
3461bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3462 SDValue &SrcMods) const {
3463 // FIXME: Handle op_sel
3464 return SelectVOP3Mods(In, Src, SrcMods);
3465}
3466
3467// The return value is not whether the match is possible (which it always is),
3468// but whether or not it a conversion is really used.
3469bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3470 unsigned &Mods) const {
3471 Mods = 0;
3472 SelectVOP3ModsImpl(In, Src, Mods);
3473
3474 if (Src.getOpcode() == ISD::FP_EXTEND) {
3475 Src = Src.getOperand(0);
3476 assert(Src.getValueType() == MVT::f16);
3477 Src = stripBitcast(Src);
3478
3479 // Be careful about folding modifiers if we already have an abs. fneg is
3480 // applied last, so we don't want to apply an earlier fneg.
3481 if ((Mods & SISrcMods::ABS) == 0) {
3482 unsigned ModsTmp;
3483 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3484
3485 if ((ModsTmp & SISrcMods::NEG) != 0)
3486 Mods ^= SISrcMods::NEG;
3487
3488 if ((ModsTmp & SISrcMods::ABS) != 0)
3489 Mods |= SISrcMods::ABS;
3490 }
3491
3492 // op_sel/op_sel_hi decide the source type and source.
3493 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3494 // If the sources's op_sel is set, it picks the high half of the source
3495 // register.
3496
3497 Mods |= SISrcMods::OP_SEL_1;
3498 if (isExtractHiElt(Src, Src)) {
3499 Mods |= SISrcMods::OP_SEL_0;
3500
3501 // TODO: Should we try to look for neg/abs here?
3502 }
3503
3504 return true;
3505 }
3506
3507 return false;
3508}
3509
3510bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3511 SDValue &SrcMods) const {
3512 unsigned Mods = 0;
3513 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3514 return false;
3515 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3516 return true;
3517}
3518
3519bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3520 SDValue &SrcMods) const {
3521 unsigned Mods = 0;
3522 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3523 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3524 return true;
3525}
3526
3527SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3528 if (In.isUndef())
3529 return CurDAG->getUNDEF(MVT::i32);
3530
3531 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3532 SDLoc SL(In);
3533 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3534 }
3535
3536 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3537 SDLoc SL(In);
3538 return CurDAG->getConstant(
3539 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3540 }
3541
3542 SDValue Src;
3543 if (isExtractHiElt(In, Src))
3544 return Src;
3545
3546 return SDValue();
3547}
3548
3549bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3551
3552 const SIRegisterInfo *SIRI =
3553 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3554 const SIInstrInfo * SII =
3555 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3556
3557 unsigned Limit = 0;
3558 bool AllUsesAcceptSReg = true;
3559 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3560 Limit < 10 && U != E; ++U, ++Limit) {
3561 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3562
3563 // If the register class is unknown, it could be an unknown
3564 // register class that needs to be an SGPR, e.g. an inline asm
3565 // constraint
3566 if (!RC || SIRI->isSGPRClass(RC))
3567 return false;
3568
3569 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3570 AllUsesAcceptSReg = false;
3571 SDNode * User = *U;
3572 if (User->isMachineOpcode()) {
3573 unsigned Opc = User->getMachineOpcode();
3574 const MCInstrDesc &Desc = SII->get(Opc);
3575 if (Desc.isCommutable()) {
3576 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3577 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3578 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3579 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3580 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3581 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3582 CommutedRC == &AMDGPU::VS_64RegClass)
3583 AllUsesAcceptSReg = true;
3584 }
3585 }
3586 }
3587 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
3588 // commuting current user. This means have at least one use
3589 // that strictly require VGPR. Thus, we will not attempt to commute
3590 // other user instructions.
3591 if (!AllUsesAcceptSReg)
3592 break;
3593 }
3594 }
3595 return !AllUsesAcceptSReg && (Limit < 10);
3596}
3597
3598bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3599 auto Ld = cast<LoadSDNode>(N);
3600
3601 const MachineMemOperand *MMO = Ld->getMemOperand();
3602 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3603 return false;
3604
3605 return MMO->getSize().hasValue() &&
3606 Ld->getAlign() >=
3607 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3608 uint64_t(4))) &&
3609 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3610 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3611 (Subtarget->getScalarizeGlobalBehavior() &&
3612 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3613 Ld->isSimple() &&
3614 static_cast<const SITargetLowering *>(getTargetLowering())
3615 ->isMemOpHasNoClobberedMemOperand(N)));
3616}
3617
3620 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3621 bool IsModified = false;
3622 do {
3623 IsModified = false;
3624
3625 // Go over all selected nodes and try to fold them a bit more
3627 while (Position != CurDAG->allnodes_end()) {
3628 SDNode *Node = &*Position++;
3629 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3630 if (!MachineNode)
3631 continue;
3632
3633 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3634 if (ResNode != Node) {
3635 if (ResNode)
3636 ReplaceUses(Node, ResNode);
3637 IsModified = true;
3638 }
3639 }
3641 } while (IsModified);
3642}
3643
3645 CodeGenOptLevel OptLevel)
3647 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3648
unsigned const MachineRegisterInfo * MRI
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static MachineSDNode * buildRegSequence32(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL)
static SDValue matchZExtFromI32(SDValue Op)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static MachineSDNode * buildRegSequence(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL, unsigned ElementSize)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static MemSDNode * findMemSDNode(SDNode *N)
static bool isNoUnsignedWrap(SDValue Addr)
static MachineSDNode * buildRegSequence16(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< SDValue > &Elts, SDValue &Src, llvm::SelectionDAG *CurDAG, const SDLoc &DL, unsigned ElementSize)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
#define F(x, y, z)
Definition: MD5.cpp:55
unsigned const TargetRegisterInfo * TRI
if(VerifyEach)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
pre isel intrinsic Pre ISel Intrinsic Lowering
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
support::ulittle16_t & Lo
Definition: aarch32.cpp:206
support::ulittle16_t & Hi
Definition: aarch32.cpp:205
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
AMDGPUDAGToDAGISel()=delete
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static bool isUniformMMO(const MachineMemOperand *MMO)
unsigned getWavefrontSizeLog2() const
bool hasInv2PiInlineImm() const
static SDValue stripBitcast(SDValue Val)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1500
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1615
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
A "pseudo-class" with methods for operating on BUILD_VECTORs.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
int64_t getSExtValue() const
This class represents an Operation in the Expression.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
int getLDSBankCount() const
Definition: GCNSubtarget.h:339
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:467
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:471
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:626
bool hasDLInsts() const
Definition: GCNSubtarget.h:764
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:265
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:552
bool hasSignedScratchOffsets() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:277
bool hasDOTOpSelHazard() const
bool d16PreservesUnusedBits() const
Definition: GCNSubtarget.h:691
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:679
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:965
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:701
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:530
Generation getGeneration() const
Definition: GCNSubtarget.h:316
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:715
bool hasAddr64() const
Definition: GCNSubtarget.h:380
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:723
bool hasSALUFloatInsts() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:598
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
const Triple & getTargetTriple() const
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
MachineFunction * MF
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:488
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDNode * SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT)
These are used for target selectors to mutate the specified node to have the specified return type,...
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
allnodes_const_iterator allnodes_begin() const
Definition: SelectionDAG.h:545
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
allnodes_const_iterator allnodes_end() const
Definition: SelectionDAG.h:546
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:739
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNodes()
This method deletes all unreachable nodes in the SelectionDAG.
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:785
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:688
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
ilist< SDNode >::iterator allnodes_iterator
Definition: SelectionDAG.h:548
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
static const unsigned CommuteAnyOperandIndex
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
Legacy analysis pass which computes a CycleInfo.
LLVM Value Representation.
Definition: Value.h:74
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:778
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1167
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1072
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:818
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:931
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:974
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1437
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:634
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:958
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:733
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:808
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1006
@ TargetFrameIndex
Definition: ISDOpcodes.h:172
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:826
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:916
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:897
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:814
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1111
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1645
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1574
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
constexpr const char32_t SBase
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:267
bool isBoolSGPR(SDValue V)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154