LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into a AMDGPU-specific
164// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
/// \returns true when an f16 result of operation \p Opc is known to have
/// zeros in the high 16 bits of its 32-bit register.
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    // NOTE(review): the return statements for this case and for the
    // FMA/FMAD/DIV_FIXUP group are not visible in this excerpt; as written
    // these labels fall through to the default — confirm against the full
    // source.
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
379 return TII->isInlineConstant(C->getAPIntValue());
380
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \OpNo or NULL if the register class cannot be
390/// determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
438 Ops.push_back(NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(N->getOperand(i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
572 DL, EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
585}
586
588 EVT VT = N->getValueType(0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(N);
599
600 SDValue Src0 = SVN->getOperand(0);
601 SDValue Src1 = SVN->getOperand(1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
653
654 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
655 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
673 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(0) != MVT::i32)
710 break;
711
712 SelectAddcSubb(N);
713 return;
714 case ISD::UADDO:
715 case ISD::USUBO: {
716 SelectUADDO_USUBO(N);
717 return;
718 }
719 case AMDGPUISD::FMUL_W_CHAIN: {
720 SelectFMUL_W_CHAIN(N);
721 return;
722 }
723 case AMDGPUISD::FMA_W_CHAIN: {
724 SelectFMA_W_CHAIN(N);
725 return;
726 }
727
729 case ISD::BUILD_VECTOR: {
730 EVT VT = N->getValueType(0);
731 unsigned NumVectorElts = VT.getVectorNumElements();
732 if (VT.getScalarSizeInBits() == 16) {
733 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
734 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
735 ReplaceNode(N, Packed);
736 return;
737 }
738 }
739
740 break;
741 }
742
743 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
744 assert(VT.getVectorElementType().bitsEq(MVT::i32));
745 const TargetRegisterClass *RegClass =
746 N->isDivergent()
747 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
748 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
749
750 SelectBuildVector(N, RegClass->getID());
751 return;
752 }
755 return;
756 case ISD::BUILD_PAIR: {
757 SDValue RC, SubReg0, SubReg1;
758 SDLoc DL(N);
759 if (N->getValueType(0) == MVT::i128) {
760 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
761 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
762 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
763 } else if (N->getValueType(0) == MVT::i64) {
764 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
765 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
766 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
767 } else {
768 llvm_unreachable("Unhandled value type for BUILD_PAIR");
769 }
770 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
771 N->getOperand(1), SubReg1 };
772 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
773 N->getValueType(0), Ops));
774 return;
775 }
776
777 case ISD::Constant:
778 case ISD::ConstantFP: {
779 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
780 Subtarget->has64BitLiterals())
781 break;
782
783 uint64_t Imm;
785 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
786 if (AMDGPU::isValid32BitLiteral(Imm, true))
787 break;
788 } else {
790 Imm = C->getZExtValue();
791 if (AMDGPU::isValid32BitLiteral(Imm, false))
792 break;
793 }
794
795 SDLoc DL(N);
796 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
797 return;
798 }
799 case AMDGPUISD::BFE_I32:
800 case AMDGPUISD::BFE_U32: {
801 // There is a scalar version available, but unlike the vector version which
802 // has a separate operand for the offset and width, the scalar version packs
803 // the width and offset into a single operand. Try to move to the scalar
804 // version if the offsets are constant, so that we can try to keep extended
805 // loads of kernel arguments in SGPRs.
806
807 // TODO: Technically we could try to pattern match scalar bitshifts of
808 // dynamic values, but it's probably not useful.
810 if (!Offset)
811 break;
812
813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
814 if (!Width)
815 break;
816
817 bool Signed = Opc == AMDGPUISD::BFE_I32;
818
819 uint32_t OffsetVal = Offset->getZExtValue();
820 uint32_t WidthVal = Width->getZExtValue();
821
822 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
823 WidthVal));
824 return;
825 }
826 case AMDGPUISD::DIV_SCALE: {
827 SelectDIV_SCALE(N);
828 return;
829 }
832 SelectMAD_64_32(N);
833 return;
834 }
835 case ISD::SMUL_LOHI:
836 case ISD::UMUL_LOHI:
837 return SelectMUL_LOHI(N);
838 case ISD::CopyToReg: {
840 *static_cast<const SITargetLowering*>(getTargetLowering());
841 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
842 break;
843 }
844 case ISD::AND:
845 case ISD::SRL:
846 case ISD::SRA:
848 if (N->getValueType(0) != MVT::i32)
849 break;
850
851 SelectS_BFE(N);
852 return;
853 case ISD::BRCOND:
854 SelectBRCOND(N);
855 return;
856 case ISD::FP_EXTEND:
857 SelectFP_EXTEND(N);
858 return;
859 case AMDGPUISD::CVT_PKRTZ_F16_F32:
860 case AMDGPUISD::CVT_PKNORM_I16_F32:
861 case AMDGPUISD::CVT_PKNORM_U16_F32:
862 case AMDGPUISD::CVT_PK_U16_U32:
863 case AMDGPUISD::CVT_PK_I16_I32: {
864 // Hack around using a legal type if f16 is illegal.
865 if (N->getValueType(0) == MVT::i32) {
866 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
867 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
868 { N->getOperand(0), N->getOperand(1) });
869 SelectCode(N);
870 return;
871 }
872
873 break;
874 }
876 SelectINTRINSIC_W_CHAIN(N);
877 return;
878 }
880 SelectINTRINSIC_WO_CHAIN(N);
881 return;
882 }
883 case ISD::INTRINSIC_VOID: {
884 SelectINTRINSIC_VOID(N);
885 return;
886 }
888 SelectWAVE_ADDRESS(N);
889 return;
890 }
891 case ISD::STACKRESTORE: {
892 SelectSTACKRESTORE(N);
893 return;
894 }
895 }
896
897 SelectCode(N);
898}
899
901 if (!Subtarget->hasSDWA())
902 return false;
903
904 if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
905 EVT VT = cast<VTSDNode>(N->getOperand(1))->getVT();
906 return VT.getScalarSizeInBits() == 8 || VT.getScalarSizeInBits() == 16;
907 }
908
909 if (N->getOpcode() == ISD::AND)
910 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
911 return RHS->getZExtValue() == 0xFF || RHS->getZExtValue() == 0xFFFF;
912
913 if (N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL)
914 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
915 return (RHS->getZExtValue() % 8) == 0;
916
917 return false;
918}
919
920bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
921 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
922 const Instruction *Term = BB->getTerminator();
923 return Term->getMetadata("amdgpu.uniform") ||
924 Term->getMetadata("structurizecfg.uniform");
925}
926
927bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
928 unsigned ShAmtBits) const {
929 assert(N->getOpcode() == ISD::AND);
930
931 const APInt &RHS = N->getConstantOperandAPInt(1);
932 if (RHS.countr_one() >= ShAmtBits)
933 return true;
934
935 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
936 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
937}
938
940 SDValue &N0, SDValue &N1) {
941 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
943 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
944 // (i64 (bitcast (v2i32 (build_vector
945 // (or (extract_vector_elt V, 0), OFFSET),
946 // (extract_vector_elt V, 1)))))
947 SDValue Lo = Addr.getOperand(0).getOperand(0);
948 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
949 SDValue BaseLo = Lo.getOperand(0);
950 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
951 // Check that split base (Lo and Hi) are extracted from the same one.
952 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
954 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
955 // Lo is statically extracted from index 0.
956 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
957 BaseLo.getConstantOperandVal(1) == 0 &&
958 // Hi is statically extracted from index 0.
959 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
960 BaseHi.getConstantOperandVal(1) == 1) {
961 N0 = BaseLo.getOperand(0).getOperand(0);
962 N1 = Lo.getOperand(1);
963 return true;
964 }
965 }
966 }
967 return false;
968}
969
/// Match \p Addr as (base + constant offset), returning the parts in
/// \p LHS / \p RHS.
bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  // Simple case: the generic base-with-constant-offset matcher succeeds.
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  // NOTE(review): the condition guarding this second success path is not
  // visible in this excerpt — presumably a call to the split-OR base/offset
  // matcher defined above, which also sets LHS/RHS; confirm against the full
  // source.
    return true;
  }

  return false;
}
985
987 return "AMDGPU DAG->DAG Pattern Instruction Selection";
988}
989
993
997#ifdef EXPENSIVE_CHECKS
999 .getManager();
1000 auto &F = MF.getFunction();
1001 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
1002 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
1003 for (auto &L : LI.getLoopsInPreorder())
1004 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
1005#endif
1006 return SelectionDAGISelPass::run(MF, MFAM);
1007}
1008
1009//===----------------------------------------------------------------------===//
1010// Complex Patterns
1011//===----------------------------------------------------------------------===//
1012
/// VTX_READ addressing is not matched by this selector; always fail so other
/// patterns handle it.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
1017
1018bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
1019 SDValue &Offset) {
1021 SDLoc DL(Addr);
1022
1023 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1024 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1025 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1026 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1027 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1028 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1029 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1030 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1031 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1032 Base = Addr.getOperand(0);
1033 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1034 } else {
1035 Base = Addr;
1036 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1037 }
1038
1039 return true;
1040}
1041
1042SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1043 const SDLoc &DL) const {
1044 SDNode *Mov = CurDAG->getMachineNode(
1045 AMDGPU::S_MOV_B32, DL, MVT::i32,
1046 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1047 return SDValue(Mov, 0);
1048}
1049
// FIXME: Should only handle uaddo_carry/usubo_carry
// Expand a 64-bit add/sub (with or without carry in/out) into a 32-bit
// low-half op plus a carry-consuming high-half op, glued together, then
// recombine the two halves with REG_SEQUENCE.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  // ADDE/SUBE take an incoming carry (operand 2); ADDC/SUBC/ADDE/SUBE all
  // produce a carry-out (result 1).
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  // Split both 64-bit operands into 32-bit halves.
  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  // Each half-op yields the 32-bit result and a glue value carrying the
  // carry to the next op.
  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  // Indexed as [carry-variant][divergent][is-add].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  // Low half: plain op, or carry-consuming op when there is a carry-in.
  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  // High half always consumes the low half's carry (glue result 1).
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
1119
1120void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1121 SDValue LHS = N->getOperand(0);
1122 SDValue RHS = N->getOperand(1);
1123 SDValue CI = N->getOperand(2);
1124
1125 if (N->isDivergent()) {
1126 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1127 : AMDGPU::V_SUBB_U32_e64;
1128 CurDAG->SelectNodeTo(
1129 N, Opc, N->getVTList(),
1130 {LHS, RHS, CI,
1131 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1132 } else {
1133 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1134 : AMDGPU::S_SUB_CO_PSEUDO;
1135 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1136 }
1137}
1138
1139void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1140 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1141 // carry out despite the _i32 name. These were renamed in VI to _U32.
1142 // FIXME: We should probably rename the opcodes here.
1143 bool IsAdd = N->getOpcode() == ISD::UADDO;
1144 bool IsVALU = N->isDivergent();
1145
1146 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1147 ++UI)
1148 if (UI.getUse().getResNo() == 1) {
1149 if (UI->isMachineOpcode()) {
1150 if (UI->getMachineOpcode() !=
1151 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1152 IsVALU = true;
1153 break;
1154 }
1155 } else {
1156 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1157 IsVALU = true;
1158 break;
1159 }
1160 }
1161 }
1162
1163 if (IsVALU) {
1164 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1165
1166 CurDAG->SelectNodeTo(
1167 N, Opc, N->getVTList(),
1168 {N->getOperand(0), N->getOperand(1),
1169 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1170 } else {
1171 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1172
1173 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1174 {N->getOperand(0), N->getOperand(1)});
1175 }
1176}
1177
1178void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1179 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1180 SDValue Ops[10];
1181
1182 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1183 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1184 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1185 Ops[8] = N->getOperand(0);
1186 Ops[9] = N->getOperand(4);
1187
1188 // If there are no source modifiers, prefer fmac over fma because it can use
1189 // the smaller VOP2 encoding.
1190 bool UseFMAC = Subtarget->hasDLInsts() &&
1191 cast<ConstantSDNode>(Ops[0])->isZero() &&
1192 cast<ConstantSDNode>(Ops[2])->isZero() &&
1193 cast<ConstantSDNode>(Ops[4])->isZero();
1194 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1195 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1196}
1197
// Select an fmul-with-chain to V_MUL_F32_e64, folding source modifiers.
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  // Operands 0 and 3 of the node are appended last (presumably chain and
  // trailing operand — mirrors SelectFMA_W_CHAIN above).
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}
1209
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  // Fold VOP3B source modifiers for each operand; Mods0 also fills the
  // clamp/omod slots (Ops[6], Ops[7]).
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
1228
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  // Use the no-carry MAD variant when the subtarget has it and the carry-out
  // (result 1) is unused.
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
  if (Subtarget->hasMADIntraFwdBug())
    // Subtargets with the MAD intra-forwarding bug need the _gfx11 encoding.
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else if (UseNoCarry)
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };

  if (UseNoCarry) {
    // The no-carry instruction has a single i64 result, so it cannot replace
    // N (which has two results) in place; rewire result 0 and drop N.
    MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
    ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
    CurDAG->RemoveDeadNode(N);
    return;
  }

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
1257
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDVTList VTList;
  unsigned Opc;
  // Lower [su]mul_lohi to a 64-bit MAD with a zero addend; prefer the
  // no-carry variant when available (the carry result is not needed here).
  if (Subtarget->hasMadU64U32NoCarry()) {
    VTList = CurDAG->getVTList(MVT::i64);
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  } else {
    VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
    if (Subtarget->hasMADIntraFwdBug()) {
      // Subtargets with the MAD intra-forwarding bug need the _gfx11 encoding.
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                   : AMDGPU::V_MAD_U64_U32_gfx11_e64;
    } else {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
    }
  }

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
  // The MAD's i64 result holds both halves: extract sub0 for the low result
  // and sub1 for the high result, only materializing the halves that are used.
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
1296
1297bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1298 if (!isUInt<16>(Offset))
1299 return false;
1300
1301 if (!Base || Subtarget->hasUsableDSOffset() ||
1302 Subtarget->unsafeDSOffsetFoldingEnabled())
1303 return true;
1304
1305 // On Southern Islands instruction with a negative base value and an offset
1306 // don't seem to work.
1307 return CurDAG->SignBitIsZero(Base);
1308}
1309
1310bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1311 SDValue &Offset) const {
1312 SDLoc DL(Addr);
1313 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1314 SDValue N0 = Addr.getOperand(0);
1315 SDValue N1 = Addr.getOperand(1);
1316 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1317 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1318 // (add n0, c0)
1319 Base = N0;
1320 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1321 return true;
1322 }
1323 } else if (Addr.getOpcode() == ISD::SUB) {
1324 // sub C, x -> add (sub 0, x), C
1325 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1326 int64_t ByteOffset = C->getSExtValue();
1327 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1328 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1329
1330 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1331 // the known bits in isDSOffsetLegal. We need to emit the selected node
1332 // here, so this is thrown away.
1333 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1334 Zero, Addr.getOperand(1));
1335
1336 if (isDSOffsetLegal(Sub, ByteOffset)) {
1338 Opnds.push_back(Zero);
1339 Opnds.push_back(Addr.getOperand(1));
1340
1341 // FIXME: Select to VOP3 version for with-carry.
1342 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1343 if (Subtarget->hasAddNoCarryInsts()) {
1344 SubOp = AMDGPU::V_SUB_U32_e64;
1345 Opnds.push_back(
1346 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1347 }
1348
1349 MachineSDNode *MachineSub =
1350 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1351
1352 Base = SDValue(MachineSub, 0);
1353 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1354 return true;
1355 }
1356 }
1357 }
1358 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1359 // If we have a constant address, prefer to put the constant into the
1360 // offset. This can save moves to load the constant address since multiple
1361 // operations can share the zero base address register, and enables merging
1362 // into read2 / write2 instructions.
1363
1364 SDLoc DL(Addr);
1365
1366 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1367 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1368 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1369 DL, MVT::i32, Zero);
1370 Base = SDValue(MovZero, 0);
1371 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1372 return true;
1373 }
1374 }
1375
1376 // default case
1377 Base = Addr;
1378 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1379 return true;
1380}
1381
1382bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1383 unsigned Offset1,
1384 unsigned Size) const {
1385 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1386 return false;
1387 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1388 return false;
1389
1390 if (!Base || Subtarget->hasUsableDSOffset() ||
1391 Subtarget->unsafeDSOffsetFoldingEnabled())
1392 return true;
1393
1394 // On Southern Islands instruction with a negative base value and an offset
1395 // don't seem to work.
1396 return CurDAG->SignBitIsZero(Base);
1397}
1398
1399// Return whether the operation has NoUnsignedWrap property.
1400static bool isNoUnsignedWrap(SDValue Addr) {
1401 return (Addr.getOpcode() == ISD::ADD &&
1402 Addr->getFlags().hasNoUnsignedWrap()) ||
1403 Addr->getOpcode() == ISD::OR;
1404}
1405
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // A nuw add (or OR-as-add) cannot produce a negative base.
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise the base must be provably non-negative.
  return CurDAG->SignBitIsZero(LHS);
}
1433
1434// Check address value in SGPR/VGPR are legal for flat scratch in the form
1435// of: SGPR + VGPR.
1436bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1437 if (isNoUnsignedWrap(Addr))
1438 return true;
1439
1440 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1441 // values.
1442 if (Subtarget->hasSignedScratchOffsets())
1443 return true;
1444
1445 auto LHS = Addr.getOperand(0);
1446 auto RHS = Addr.getOperand(1);
1447 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1448}
1449
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  // Addr is (Base + Imm) where Base is itself (SGPR + VGPR).
  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both components of the inner add must be provably non-negative.
  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}
1473
// TODO: If offset is too big, put low 16-bit into offset.
// Match a DS read2/write2 address for 2x32-bit (4-byte-element) access.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}
1480
// Match a DS read2/write2 address for 2x64-bit (8-byte-element) access.
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
1486
1487bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1488 SDValue &Offset0, SDValue &Offset1,
1489 unsigned Size) const {
1490 SDLoc DL(Addr);
1491
1492 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1493 SDValue N0 = Addr.getOperand(0);
1494 SDValue N1 = Addr.getOperand(1);
1495 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1496 unsigned OffsetValue0 = C1->getZExtValue();
1497 unsigned OffsetValue1 = OffsetValue0 + Size;
1498
1499 // (add n0, c0)
1500 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1501 Base = N0;
1502 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1503 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1504 return true;
1505 }
1506 } else if (Addr.getOpcode() == ISD::SUB) {
1507 // sub C, x -> add (sub 0, x), C
1508 if (const ConstantSDNode *C =
1510 unsigned OffsetValue0 = C->getZExtValue();
1511 unsigned OffsetValue1 = OffsetValue0 + Size;
1512
1513 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1514 SDLoc DL(Addr);
1515 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1516
1517 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1518 // the known bits in isDSOffsetLegal. We need to emit the selected node
1519 // here, so this is thrown away.
1520 SDValue Sub =
1521 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1522
1523 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1525 Opnds.push_back(Zero);
1526 Opnds.push_back(Addr.getOperand(1));
1527 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1528 if (Subtarget->hasAddNoCarryInsts()) {
1529 SubOp = AMDGPU::V_SUB_U32_e64;
1530 Opnds.push_back(
1531 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1532 }
1533
1534 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1535 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1536
1537 Base = SDValue(MachineSub, 0);
1538 Offset0 =
1539 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1540 Offset1 =
1541 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1542 return true;
1543 }
1544 }
1545 }
1546 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1547 unsigned OffsetValue0 = CAddr->getZExtValue();
1548 unsigned OffsetValue1 = OffsetValue0 + Size;
1549
1550 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1551 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1552 MachineSDNode *MovZero =
1553 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1554 Base = SDValue(MovZero, 0);
1555 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1556 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1557 return true;
1558 }
1559 }
1560
1561 // default case
1562
1563 Base = Addr;
1564 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1565 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1566 return true;
1567}
1568
// Decompose Addr into the MUBUF operand set: resource pointer, vector
// address, scalar offset, immediate offset, and the offen/idxen/addr64 mode
// bits. Divergent address components go in VAddr; uniform ones in Ptr.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  // Default mode bits: no offen/idxen/addr64; soffset is 0 (SGPR_NULL on
  // subtargets with restricted soffset).
  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  // Peel a constant offset off the address if it fits in 32 bits.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0->isAnyAdd()) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}
1656
// Match a MUBUF addr64-form address: succeeds only when SelectMUBUF chose
// the addr64 mode, in which case Ptr is wrapped into a full resource (SRsrc).
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  // Only accept the decomposition when SelectMUBUF actually set addr64.
  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}
1683
1684std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1685 SDLoc DL(N);
1686
1687 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1688 SDValue TFI =
1689 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1690
1691 // We rebase the base address into an absolute stack address and hence
1692 // use constant 0 for soffset. This value must be retained until
1693 // frame elimination and eliminateFrameIndex will choose the appropriate
1694 // frame register if need be.
1695 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1696}
1697
1698bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1699 SDValue Addr, SDValue &Rsrc,
1700 SDValue &VAddr, SDValue &SOffset,
1701 SDValue &ImmOffset) const {
1702
1703 SDLoc DL(Addr);
1704 MachineFunction &MF = CurDAG->getMachineFunction();
1705 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1706
1707 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1708
1709 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1710 int64_t Imm = CAddr->getSExtValue();
1711 const int64_t NullPtr =
1713 // Don't fold null pointer.
1714 if (Imm != NullPtr) {
1715 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1716 SDValue HighBits =
1717 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1718 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1719 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1720 VAddr = SDValue(MovHighBits, 0);
1721
1722 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1723 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1724 return true;
1725 }
1726 }
1727
1728 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1729 // (add n0, c1)
1730
1731 SDValue N0 = Addr.getOperand(0);
1732 uint64_t C1 = Addr.getConstantOperandVal(1);
1733
1734 // Offsets in vaddr must be positive if range checking is enabled.
1735 //
1736 // The total computation of vaddr + soffset + offset must not overflow. If
1737 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1738 // overflowing.
1739 //
1740 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1741 // always perform a range check. If a negative vaddr base index was used,
1742 // this would fail the range check. The overall address computation would
1743 // compute a valid address, but this doesn't happen due to the range
1744 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1745 //
1746 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1747 // MUBUF vaddr, but not on older subtargets which can only do this if the
1748 // sign bit is known 0.
1749 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1750 if (TII->isLegalMUBUFImmOffset(C1) &&
1751 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1752 CurDAG->SignBitIsZero(N0))) {
1753 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1754 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1755 return true;
1756 }
1757 }
1758
1759 // (node)
1760 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1761 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1762 return true;
1763}
1764
1765static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1766 if (Val.getOpcode() != ISD::CopyFromReg)
1767 return false;
1768 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1769 if (!Reg.isPhysical())
1770 return false;
1771 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1772 return RC && TRI.isSGPRClass(RC);
1773}
1774
// Match a scratch address for the MUBUF offset form (no VGPR address):
// either a plain SGPR copy, SGPR + legal immediate, or a bare legal
// immediate. SRsrc is always the function's scratch resource descriptor.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  // Both non-failing paths leave a legal constant in CAddr.
  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}
1817
// Match a MUBUF offset-form address: succeeds only when SelectMUBUF chose
// none of offen/idxen/addr64, building a default resource around Ptr.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  // Only usable when no addressing mode bits were set by SelectMUBUF.
  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    // Default data format with a full 32-bit size field.
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}
1842
1843bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1844 SDValue &SOffset) const {
1845 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1846 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1847 return true;
1848 }
1849
1850 SOffset = ByteOffsetNode;
1851 return true;
1852}
1853
1854// Find a load or store from corresponding pattern root.
1855// Roots may be build_vector, bitconvert or their combinations.
1858 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1859 return MN;
1861 for (SDValue V : N->op_values())
1862 if (MemSDNode *MN =
1864 return MN;
1865 llvm_unreachable("cannot find MemSDNode in the pattern!");
1866}
1867
1868bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1869 SDValue &VAddr, SDValue &Offset,
1870 uint64_t FlatVariant) const {
1871 int64_t OffsetVal = 0;
1872
1873 unsigned AS = findMemSDNode(N)->getAddressSpace();
1874
1875 bool CanHaveFlatSegmentOffsetBug =
1876 Subtarget->hasFlatSegmentOffsetBug() &&
1877 FlatVariant == SIInstrFlags::FLAT &&
1879
1880 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1881 SDValue N0, N1;
1882 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1883 (FlatVariant != SIInstrFlags::FlatScratch ||
1884 isFlatScratchBaseLegal(Addr))) {
1885 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1886
1887 // Adding the offset to the base address in a FLAT instruction must not
1888 // change the memory aperture in which the address falls. Therefore we can
1889 // only fold offsets from inbounds GEPs into FLAT instructions.
1890 bool IsInBounds =
1891 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1892 if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1893 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1894 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1895 Addr = N0;
1896 OffsetVal = COffsetVal;
1897 } else {
1898 // If the offset doesn't fit, put the low bits into the offset field
1899 // and add the rest.
1900 //
1901 // For a FLAT instruction the hardware decides whether to access
1902 // global/scratch/shared memory based on the high bits of vaddr,
1903 // ignoring the offset field, so we have to ensure that when we add
1904 // remainder to vaddr it still points into the same underlying object.
1905 // The easiest way to do that is to make sure that we split the offset
1906 // into two pieces that are both >= 0 or both <= 0.
1907
1908 SDLoc DL(N);
1909 uint64_t RemainderOffset;
1910
1911 std::tie(OffsetVal, RemainderOffset) =
1912 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1913
1914 SDValue AddOffsetLo =
1915 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1916 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1917
1918 if (Addr.getValueType().getSizeInBits() == 32) {
1920 Opnds.push_back(N0);
1921 Opnds.push_back(AddOffsetLo);
1922 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1923 if (Subtarget->hasAddNoCarryInsts()) {
1924 AddOp = AMDGPU::V_ADD_U32_e64;
1925 Opnds.push_back(Clamp);
1926 }
1927 Addr =
1928 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1929 } else {
1930 // TODO: Should this try to use a scalar add pseudo if the base
1931 // address is uniform and saddr is usable?
1932 SDValue Sub0 =
1933 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1934 SDValue Sub1 =
1935 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1936
1937 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1938 DL, MVT::i32, N0, Sub0);
1939 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1940 DL, MVT::i32, N0, Sub1);
1941
1942 SDValue AddOffsetHi =
1943 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1944
1945 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1946
1947 SDNode *Add =
1948 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1949 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1950
1951 SDNode *Addc = CurDAG->getMachineNode(
1952 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1953 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1954
1955 SDValue RegSequenceArgs[] = {
1956 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1957 MVT::i32),
1958 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1959
1960 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1961 MVT::i64, RegSequenceArgs),
1962 0);
1963 }
1964 }
1965 }
1966 }
1967 }
1968
1969 VAddr = Addr;
1970 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1971 return true;
1972}
1973
// Match a FLAT (generic segment) address as vaddr + immediate offset by
// delegating to SelectFlatOffsetImpl with the plain FLAT variant.
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}
1979
// Match a global-segment address as vaddr + immediate offset by delegating
// to SelectFlatOffsetImpl with the FlatGlobal variant.
bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}
1985
1986bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1987 SDValue &VAddr,
1988 SDValue &Offset) const {
1989 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1991}
1992
1993// If this matches *_extend i32:x, return x
1994// Otherwise if the value is I32 returns x.
1996 const SelectionDAG *DAG) {
1997 if (Op.getValueType() == MVT::i32)
1998 return Op;
1999
2000 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
2001 Op.getOpcode() != ISD::ANY_EXTEND &&
2002 !(DAG->SignBitIsZero(Op) &&
2003 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
2004 return SDValue();
2005
2006 SDValue ExtSrc = Op.getOperand(0);
2007 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
2008}
2009
2010// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
2011// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
2012bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2013 SDValue &SAddr, SDValue &VOffset,
2014 SDValue &Offset, bool &ScaleOffset,
2015 bool NeedIOffset) const {
2016 int64_t ImmOffset = 0;
2017 ScaleOffset = false;
2018
2019 // Match the immediate offset first, which canonically is moved as low as
2020 // possible.
2021
2022 SDValue LHS, RHS;
2023 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2024 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2025 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2026
2027 if (NeedIOffset &&
2028 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
2030 Addr = LHS;
2031 ImmOffset = COffsetVal;
2032 } else if (!LHS->isDivergent()) {
2033 if (COffsetVal > 0) {
2034 SDLoc SL(N);
2035 // saddr + large_offset -> saddr +
2036 // (voffset = large_offset & ~MaxOffset) +
2037 // (large_offset & MaxOffset);
2038 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2039 if (NeedIOffset) {
2040 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2042 }
2043
2044 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2045 : isUInt<32>(RemainderOffset)) {
2046 SDNode *VMov = CurDAG->getMachineNode(
2047 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2048 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2049 VOffset = SDValue(VMov, 0);
2050 SAddr = LHS;
2051 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2052 return true;
2053 }
2054 }
2055
2056 // We are adding a 64 bit SGPR and a constant. If constant bus limit
2057 // is 1 we would need to perform 1 or 2 extra moves for each half of
2058 // the constant and it is better to do a scalar add and then issue a
2059 // single VALU instruction to materialize zero. Otherwise it is less
2060 // instructions to perform VALU adds with immediates or inline literals.
2061 unsigned NumLiterals =
2062 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2063 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2064 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2065 return false;
2066 }
2067 }
2068
2069 // Match the variable offset.
2070 if (Addr->isAnyAdd()) {
2071 LHS = Addr.getOperand(0);
2072
2073 if (!LHS->isDivergent()) {
2074 // add (i64 sgpr), (*_extend (i32 vgpr))
2075 RHS = Addr.getOperand(1);
2076 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2077 if (SDValue ExtRHS = matchExtFromI32orI32(
2078 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2079 SAddr = LHS;
2080 VOffset = ExtRHS;
2081 }
2082 }
2083
2084 RHS = Addr.getOperand(1);
2085 if (!SAddr && !RHS->isDivergent()) {
2086 // add (*_extend (i32 vgpr)), (i64 sgpr)
2087 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2088 if (SDValue ExtLHS = matchExtFromI32orI32(
2089 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2090 SAddr = RHS;
2091 VOffset = ExtLHS;
2092 }
2093 }
2094
2095 if (SAddr) {
2096 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2097 return true;
2098 }
2099 }
2100
2101 if (Subtarget->hasScaleOffset() &&
2102 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2105 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2106 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2107 Addr.getOperand(0)->isDivergent() &&
2109 !Addr.getOperand(2)->isDivergent()) {
2110 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2111 unsigned Size =
2112 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2113 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2114 if (ScaleOffset) {
2115 SAddr = Addr.getOperand(2);
2116 VOffset = Addr.getOperand(0);
2117 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2118 return true;
2119 }
2120 }
2121
2122 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2123 isa<ConstantSDNode>(Addr))
2124 return false;
2125
2126 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2127 // moves required to copy a 64-bit SGPR to VGPR.
2128 SAddr = Addr;
2129 SDNode *VMov =
2130 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2131 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2132 VOffset = SDValue(VMov, 0);
2133 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2134 return true;
2135}
2136
2137bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2138 SDValue &SAddr, SDValue &VOffset,
2139 SDValue &Offset,
2140 SDValue &CPol) const {
2141 bool ScaleOffset;
2142 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2143 return false;
2144
2145 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2146 SDLoc(), MVT::i32);
2147 return true;
2148}
2149
2150bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2151 SDValue &SAddr, SDValue &VOffset,
2152 SDValue &Offset,
2153 SDValue &CPol) const {
2154 bool ScaleOffset;
2155 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2156 return false;
2157
2158 // We are assuming CPol is always the last operand of the intrinsic.
2159 auto PassedCPol =
2160 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2161 CPol = CurDAG->getTargetConstant(
2162 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2163 return true;
2164}
2165
2166bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2167 SDValue &SAddr,
2168 SDValue &VOffset,
2169 SDValue &Offset,
2170 SDValue &CPol) const {
2171 bool ScaleOffset;
2172 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2173 return false;
2174
2175 // We are assuming CPol is second from last operand of the intrinsic.
2176 auto PassedCPol =
2177 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2178 CPol = CurDAG->getTargetConstant(
2179 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2180 return true;
2181}
2182
2183bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2184 SDValue &SAddr, SDValue &VOffset,
2185 SDValue &Offset,
2186 SDValue &CPol) const {
2187 bool ScaleOffset;
2188 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2189 return false;
2190
2191 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2192 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2193 return true;
2194}
2195
2196bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2197 SDValue &SAddr,
2198 SDValue &VOffset,
2199 SDValue &CPol) const {
2200 bool ScaleOffset;
2201 SDValue DummyOffset;
2202 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2203 false))
2204 return false;
2205
2206 // We are assuming CPol is always the last operand of the intrinsic.
2207 auto PassedCPol =
2208 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2209 CPol = CurDAG->getTargetConstant(
2210 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2211 return true;
2212}
2213
2214bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2215 SDValue &SAddr,
2216 SDValue &VOffset,
2217 SDValue &CPol) const {
2218 bool ScaleOffset;
2219 SDValue DummyOffset;
2220 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2221 false))
2222 return false;
2223
2224 // We are assuming CPol is second from last operand of the intrinsic.
2225 auto PassedCPol =
2226 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2227 CPol = CurDAG->getTargetConstant(
2228 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2229 return true;
2230}
2231
2233 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2234 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2235 } else if (SAddr.getOpcode() == ISD::ADD &&
2237 // Materialize this into a scalar move for scalar address to avoid
2238 // readfirstlane.
2239 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2240 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2241 FI->getValueType(0));
2242 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2243 MVT::i32, TFI, SAddr.getOperand(1)),
2244 0);
2245 }
2246
2247 return SAddr;
2248}
2249
2250// Match (32-bit SGPR base) + sext(imm offset)
2251bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2252 SDValue &SAddr,
2253 SDValue &Offset) const {
2254 if (Addr->isDivergent())
2255 return false;
2256
2257 SDLoc DL(Addr);
2258
2259 int64_t COffsetVal = 0;
2260
2261 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2262 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2263 SAddr = Addr.getOperand(0);
2264 } else {
2265 SAddr = Addr;
2266 }
2267
2268 SAddr = SelectSAddrFI(CurDAG, SAddr);
2269
2270 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2271
2272 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2274 int64_t SplitImmOffset, RemainderOffset;
2275 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2277
2278 COffsetVal = SplitImmOffset;
2279
2280 SDValue AddOffset =
2282 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2283 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2284 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2285 SAddr, AddOffset),
2286 0);
2287 }
2288
2289 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2290
2291 return true;
2292}
2293
2294// Check whether the flat scratch SVS swizzle bug affects this access.
2295bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2296 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2297 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2298 return false;
2299
2300 // The bug affects the swizzling of SVS accesses if there is any carry out
2301 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2302 // voffset to (soffset + inst_offset).
2303 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2304 KnownBits SKnown =
2305 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2306 KnownBits::makeConstant(APInt(32, ImmOffset,
2307 /*isSigned=*/true)));
2308 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2309 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2310 return (VMax & 3) + (SMax & 3) >= 4;
2311}
2312
2313bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2314 SDValue &VAddr, SDValue &SAddr,
2315 SDValue &Offset,
2316 SDValue &CPol) const {
2317 int64_t ImmOffset = 0;
2318
2319 SDValue LHS, RHS;
2320 SDValue OrigAddr = Addr;
2321 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2322 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2323 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2324
2325 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2327 Addr = LHS;
2328 ImmOffset = COffsetVal;
2329 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2330 SDLoc SL(N);
2331 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2332 // (large_offset & MaxOffset);
2333 int64_t SplitImmOffset, RemainderOffset;
2334 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2336
2337 if (isUInt<32>(RemainderOffset)) {
2338 SDNode *VMov = CurDAG->getMachineNode(
2339 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2340 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2341 VAddr = SDValue(VMov, 0);
2342 SAddr = LHS;
2343 if (!isFlatScratchBaseLegal(Addr))
2344 return false;
2345 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2346 return false;
2347 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2348 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2349 return true;
2350 }
2351 }
2352 }
2353
2354 if (Addr.getOpcode() != ISD::ADD)
2355 return false;
2356
2357 LHS = Addr.getOperand(0);
2358 RHS = Addr.getOperand(1);
2359
2360 if (!LHS->isDivergent() && RHS->isDivergent()) {
2361 SAddr = LHS;
2362 VAddr = RHS;
2363 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2364 SAddr = RHS;
2365 VAddr = LHS;
2366 } else {
2367 return false;
2368 }
2369
2370 if (OrigAddr != Addr) {
2371 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2372 return false;
2373 } else {
2374 if (!isFlatScratchBaseLegalSV(OrigAddr))
2375 return false;
2376 }
2377
2378 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2379 return false;
2380 SAddr = SelectSAddrFI(CurDAG, SAddr);
2381 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2382
2383 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2384 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2385 SDLoc(), MVT::i32);
2386 return true;
2387}
2388
2389// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2390// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2391// Handle the case where the Immediate Offset + SOffset is negative.
2392bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2393 bool Imm32Only,
2394 bool IsBuffer,
2395 int64_t ImmOffset) const {
2396 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2397 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2398 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2399 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2400 return false;
2401 }
2402
2403 return true;
2404}
2405
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  // Scaling is only available (and profitable) on subtargets with the
  // scale-offset feature, and needs a non-null offset to inspect.
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // Access size in bytes; N is expected to be a memory node.
  unsigned Size =
      (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a 32-to-64-bit extension of the offset, if any.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
    Off = Ext;

  // Case 1: (shl x, log2(Size)) — checked on the extension-stripped value.
  if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
    if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    // Case 2: (mul x, Size) in any of its signed/unsigned/24-bit/pseudo
    // forms — checked on the original (unstripped) offset.
    if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  // On a match, replace Offset with the pre-scaled operand.
  if (ScaleOffset)
    Offset = Off.getOperand(0);

  return ScaleOffset;
}
2441
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  // Optionally fold a multiply-by-access-size on the offset into the SCAL
  // cache-policy bit; only meaningful when matching an SGPR offset.
  if (ScaleOffset) {
    assert(N && SOffset);

    *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    // Non-constant offsets can only be matched into the SGPR operand.
    if (!SOffset)
      return false;

    // A plain 32-bit scalar value is usable directly as SOffset.
    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Look through a zext of a 32-bit value.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // Try the CI-only 32-bit literal encoding.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  // Last resort for an SGPR offset: materialize the constant with S_MOV_B32.
  if (SOffset) {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}
2514
2515SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2516 if (Addr.getValueType() != MVT::i32)
2517 return Addr;
2518
2519 // Zero-extend a 32-bit address.
2520 SDLoc SL(Addr);
2521
2522 const MachineFunction &MF = CurDAG->getMachineFunction();
2523 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2524 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2525 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2526
2527 const SDValue Ops[] = {
2528 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2529 Addr,
2530 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2531 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2532 0),
2533 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2534 };
2535
2536 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2537 Ops), 0);
2538}
2539
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  // When both an SGPR and an immediate offset are wanted, peel them off in
  // two passes: first the immediate, then the SGPR from the remaining base.
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
                                true, ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Try either operand as the offset; the other becomes the base.
  if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2593
2594bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2595 SDValue *SOffset, SDValue *Offset,
2596 bool Imm32Only, bool *ScaleOffset) const {
2597 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2598 /* IsBuffer */ false, /* HasSOffset */ false,
2599 /* ImmOffset */ 0, ScaleOffset)) {
2600 SBase = Expand32BitAddress(SBase);
2601 return true;
2602 }
2603
2604 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2605 SBase = Expand32BitAddress(Addr);
2606 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2607 return true;
2608 }
2609
2610 return false;
2611}
2612
// Match an SMRD address as base + encodable immediate offset only.
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
                    &Offset);
}
2618
// Match an SMRD address as base + 32-bit literal immediate offset
// (CI/Sea Islands only encoding).
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
                    &Offset, /* Imm32Only */ true);
}
2625
2626bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2627 SDValue &SOffset, SDValue &CPol) const {
2628 bool ScaleOffset;
2629 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2630 /* Imm32Only */ false, &ScaleOffset))
2631 return false;
2632
2633 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2634 SDLoc(N), MVT::i32);
2635 return true;
2636}
2637
2638bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2639 SDValue &SBase, SDValue &SOffset,
2640 SDValue &Offset,
2641 SDValue &CPol) const {
2642 bool ScaleOffset;
2643 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2644 return false;
2645
2646 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2647 SDLoc(N), MVT::i32);
2648 return true;
2649}
2650
// Match an S_BUFFER offset operand as an encodable (unsigned) immediate.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ false, /* IsBuffer */ true);
}
2655
// Match an S_BUFFER offset operand as a 32-bit literal immediate
// (CI/Sea Islands only encoding).
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}
2662
// Match an S_BUFFER offset as a 32-bit SGPR plus an immediate.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                                 SDValue &Offset) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  return N.getValueType() == MVT::i32 &&
         SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
                              /* SOffset*/ nullptr, &Offset,
                              /* Imm32Only */ false, /* IsBuffer */ true);
}
2672
2673bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2674 SDValue &Base,
2675 SDValue &Offset) const {
2676 SDLoc DL(Index);
2677
2678 if (CurDAG->isBaseWithConstantOffset(Index)) {
2679 SDValue N0 = Index.getOperand(0);
2680 SDValue N1 = Index.getOperand(1);
2681 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2682
2683 // (add n0, c0)
2684 // Don't peel off the offset (c0) if doing so could possibly lead
2685 // the base (n0) to be negative.
2686 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2687 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2688 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2689 Base = N0;
2690 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2691 return true;
2692 }
2693 }
2694
2695 if (isa<ConstantSDNode>(Index))
2696 return false;
2697
2698 Base = Index;
2699 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2700 return true;
2701}
2702
2703SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2704 SDValue Val, uint32_t Offset,
2705 uint32_t Width) {
2706 if (Val->isDivergent()) {
2707 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2708 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2709 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2710
2711 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2712 }
2713 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2714 // Transformation function, pack the offset and width of a BFE into
2715 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2716 // source, bits [5:0] contain the offset and bits [22:16] the width.
2717 uint32_t PackedVal = Offset | (Width << 16);
2718 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2719
2720 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2721}
2722
2723void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2724 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2725 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2726 // Predicate: 0 < b <= c < 32
2727
2728 const SDValue &Shl = N->getOperand(0);
2729 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2730 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2731
2732 if (B && C) {
2733 uint32_t BVal = B->getZExtValue();
2734 uint32_t CVal = C->getZExtValue();
2735
2736 if (0 < BVal && BVal <= CVal && CVal < 32) {
2737 bool Signed = N->getOpcode() == ISD::SRA;
2738 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2739 32 - CVal));
2740 return;
2741 }
2742 }
2743 SelectCode(N);
2744}
2745
2746void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2747 switch (N->getOpcode()) {
2748 case ISD::AND:
2749 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2750 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2751 // Predicate: isMask(mask)
2752 const SDValue &Srl = N->getOperand(0);
2753 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2754 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2755
2756 if (Shift && Mask) {
2757 uint32_t ShiftVal = Shift->getZExtValue();
2758 uint32_t MaskVal = Mask->getZExtValue();
2759
2760 if (isMask_32(MaskVal)) {
2761 uint32_t WidthVal = llvm::popcount(MaskVal);
2762 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2763 WidthVal));
2764 return;
2765 }
2766 }
2767 }
2768 break;
2769 case ISD::SRL:
2770 if (N->getOperand(0).getOpcode() == ISD::AND) {
2771 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2772 // Predicate: isMask(mask >> b)
2773 const SDValue &And = N->getOperand(0);
2774 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2775 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2776
2777 if (Shift && Mask) {
2778 uint32_t ShiftVal = Shift->getZExtValue();
2779 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2780
2781 if (isMask_32(MaskVal)) {
2782 uint32_t WidthVal = llvm::popcount(MaskVal);
2783 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2784 WidthVal));
2785 return;
2786 }
2787 }
2788 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2789 SelectS_BFEFromShifts(N);
2790 return;
2791 }
2792 break;
2793 case ISD::SRA:
2794 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2795 SelectS_BFEFromShifts(N);
2796 return;
2797 }
2798 break;
2799
2801 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2802 SDValue Src = N->getOperand(0);
2803 if (Src.getOpcode() != ISD::SRL)
2804 break;
2805
2806 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2807 if (!Amt)
2808 break;
2809
2810 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2811 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2812 Amt->getZExtValue(), Width));
2813 return;
2814 }
2815 }
2816
2817 SelectCode(N);
2818}
2819
2820bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2821 assert(N->getOpcode() == ISD::BRCOND);
2822 if (!N->hasOneUse())
2823 return false;
2824
2825 SDValue Cond = N->getOperand(1);
2826 if (Cond.getOpcode() == ISD::CopyToReg)
2827 Cond = Cond.getOperand(2);
2828
2829 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2830 return false;
2831
2832 MVT VT = Cond.getOperand(0).getSimpleValueType();
2833 if (VT == MVT::i32)
2834 return true;
2835
2836 if (VT == MVT::i64) {
2837 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2838 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2839 Subtarget->hasScalarCompareEq64();
2840 }
2841
2842 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2843 return true;
2844
2845 return false;
2846}
2847
2848static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2849 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2850 // Special case for amdgcn.ballot:
2851 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2852 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2853 // =>
2854 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2855 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2856 // Cond becomes a i(WaveSize) full mask value.
2857 // Note that ballot doesn't use SETEQ condition but its easy to support it
2858 // here for completeness, so in this case Negate is set true on return.
2859 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2860 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2861 isNullConstant(VCMP.getOperand(1))) {
2862
2863 auto Cond = VCMP.getOperand(0);
2864 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2865 Cond = Cond.getOperand(0);
2866
2867 if (isBoolSGPR(Cond)) {
2868 Negate = VCMP_CC == ISD::SETEQ;
2869 return Cond;
2870 }
2871 }
2872 return SDValue();
2873}
2874
// Select an ISD::BRCOND. Uniform conditions that fit an SCC compare become
// S_CBRANCH_SCC0/SCC1; everything else is copied into VCC and branched on
// with S_CBRANCH_VCCZ/VCCNZ, masking with EXEC when required.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  // An undef condition may branch either way; SI_BR_UNDEF keeps that explicit.
  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  // UseSCCBr: condition is uniform and representable as an SCC compare.
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  // AndExec: whether the condition must be ANDed with EXEC before branching.
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  // Recognize a branch on (setcc (AMDGPUISD::SETCC ...), 0, eq/ne), i.e. a
  // compare against the wave mask produced by amdgcn.ballot or similar.
  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0.  Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                      : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  // Copy the (possibly masked) condition into SCC/VCC and emit the branch.
  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2960
2961void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2962 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2963 !N->isDivergent()) {
2964 SDValue Src = N->getOperand(0);
2965 if (Src.getValueType() == MVT::f16) {
2966 if (isExtractHiElt(Src, Src)) {
2967 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2968 {Src});
2969 return;
2970 }
2971 }
2972 }
2973
2974 SelectCode(N);
2975}
2976
2977void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2978 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2979 // be copied to an SGPR with readfirstlane.
2980 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2981 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2982
2983 SDValue Chain = N->getOperand(0);
2984 SDValue Ptr = N->getOperand(2);
2985 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2986 MachineMemOperand *MMO = M->getMemOperand();
2987 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2988
2990 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2991 SDValue PtrBase = Ptr.getOperand(0);
2992 SDValue PtrOffset = Ptr.getOperand(1);
2993
2994 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2995 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2996 N = glueCopyToM0(N, PtrBase);
2997 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2998 }
2999 }
3000
3001 if (!Offset) {
3002 N = glueCopyToM0(N, Ptr);
3003 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
3004 }
3005
3006 SDValue Ops[] = {
3007 Offset,
3008 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
3009 Chain,
3010 N->getOperand(N->getNumOperands() - 1) // New glue
3011 };
3012
3013 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3014 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3015}
3016
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
  // Pick the DS opcode for the requested push/pop combination.
  // NOTE(review): there is deliberately no default case; this is only reached
  // for the four bvh_stack intrinsic IDs below — any other ID would leave
  // Opc uninitialized.
  unsigned Opc;
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
    Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
    break;
  }
  // The intrinsic's value operands (2..5), followed by the incoming chain.
  SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
                   N->getOperand(5), N->getOperand(0)};

  // Preserve the memory operand on the selected machine node.
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
3041
3042void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3043 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3044 unsigned Opc =
3045 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3046
3047 SmallVector<SDValue, 7> TensorOps;
3048 // First two groups
3049 TensorOps.push_back(N->getOperand(2)); // D# group 0
3050 TensorOps.push_back(N->getOperand(3)); // D# group 1
3051
3052 // Use _D2 version if both group 2 and 3 are zero-initialized.
3053 SDValue Group2 = N->getOperand(4);
3054 SDValue Group3 = N->getOperand(5);
3055 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3057 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3058 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3059 } else { // Has at least 4 groups
3060 TensorOps.push_back(Group2); // D# group 2
3061 TensorOps.push_back(Group3); // D# group 3
3062 }
3063
3064 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3065 // for now because all existing targets only support up to 4 groups.
3066 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3067 TensorOps.push_back(N->getOperand(7)); // cache policy
3068 TensorOps.push_back(N->getOperand(0)); // chain
3069
3070 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3071}
3072
3073static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3074 switch (IntrID) {
3075 case Intrinsic::amdgcn_ds_gws_init:
3076 return AMDGPU::DS_GWS_INIT;
3077 case Intrinsic::amdgcn_ds_gws_barrier:
3078 return AMDGPU::DS_GWS_BARRIER;
3079 case Intrinsic::amdgcn_ds_gws_sema_v:
3080 return AMDGPU::DS_GWS_SEMA_V;
3081 case Intrinsic::amdgcn_ds_gws_sema_br:
3082 return AMDGPU::DS_GWS_SEMA_BR;
3083 case Intrinsic::amdgcn_ds_gws_sema_p:
3084 return AMDGPU::DS_GWS_SEMA_P;
3085 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3086 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3087 default:
3088 llvm_unreachable("not a gws intrinsic");
3089 }
3090}
3091
3092void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3093 if (!Subtarget->hasGWS() ||
3094 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3095 !Subtarget->hasGWSSemaReleaseAll())) {
3096 // Let this error.
3097 SelectCode(N);
3098 return;
3099 }
3100
3101 // Chain, intrinsic ID, vsrc, offset
3102 const bool HasVSrc = N->getNumOperands() == 4;
3103 assert(HasVSrc || N->getNumOperands() == 3);
3104
3105 SDLoc SL(N);
3106 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3107 int ImmOffset = 0;
3108 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3109 MachineMemOperand *MMO = M->getMemOperand();
3110
3111 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3112 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3113
3114 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3115 // offset field) % 64. Some versions of the programming guide omit the m0
3116 // part, or claim it's from offset 0.
3117 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3118 // If we have a constant offset, try to use the 0 in m0 as the base.
3119 // TODO: Look into changing the default m0 initialization value. If the
3120 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3121 // the immediate offset.
3122 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3123 ImmOffset = ConstOffset->getZExtValue();
3124 } else {
3125 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3126 ImmOffset = BaseOffset.getConstantOperandVal(1);
3127 BaseOffset = BaseOffset.getOperand(0);
3128 }
3129
3130 // Prefer to do the shift in an SGPR since it should be possible to use m0
3131 // as the result directly. If it's already an SGPR, it will be eliminated
3132 // later.
3133 SDNode *SGPROffset
3134 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3135 BaseOffset);
3136 // Shift to offset in m0
3137 SDNode *M0Base
3138 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3139 SDValue(SGPROffset, 0),
3140 CurDAG->getTargetConstant(16, SL, MVT::i32));
3141 glueCopyToM0(N, SDValue(M0Base, 0));
3142 }
3143
3144 SDValue Chain = N->getOperand(0);
3145 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3146
3147 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3148
3149 const MCInstrDesc &InstrDesc = TII->get(Opc);
3150 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3151
3152 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3153
3155 if (HasVSrc) {
3156 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3157
3158 SDValue Data = N->getOperand(2);
3159 MVT DataVT = Data.getValueType().getSimpleVT();
3160 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3161 // Normal 32-bit case.
3162 Ops.push_back(N->getOperand(2));
3163 } else {
3164 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3165 // even aligned 64-bit register class.
3166 const SDValue RegSeqOps[] = {
3167 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3168 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3169 SDValue(
3170 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3171 0),
3172 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3173
3174 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3175 SL, MVT::v2i32, RegSeqOps),
3176 0));
3177 }
3178 }
3179
3180 Ops.push_back(OffsetField);
3181 Ops.push_back(Chain);
3182
3183 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3184 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3185}
3186
// Custom selection for llvm.amdgcn.interp.p1.f16 on subtargets with a 16-bank
// LDS: emit V_INTERP_MOV_F32 feeding V_INTERP_P1LV_F16, chained through m0.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //                             (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                                       (i32 timm:$attrchan), (i32 timm:$attr),
  //                                       (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Copy operand 5 into m0; the glue result threads m0 into both instructions.
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  // Replace the intrinsic's value with the second instruction's result.
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
3244
3245void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3246 unsigned IntrID = N->getConstantOperandVal(1);
3247 switch (IntrID) {
3248 case Intrinsic::amdgcn_ds_append:
3249 case Intrinsic::amdgcn_ds_consume: {
3250 if (N->getValueType(0) != MVT::i32)
3251 break;
3252 SelectDSAppendConsume(N, IntrID);
3253 return;
3254 }
3255 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3256 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3257 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3258 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3259 SelectDSBvhStackIntrinsic(N, IntrID);
3260 return;
3261 case Intrinsic::amdgcn_init_whole_wave:
3262 CurDAG->getMachineFunction()
3263 .getInfo<SIMachineFunctionInfo>()
3264 ->setInitWholeWave();
3265 break;
3266 }
3267
3268 SelectCode(N);
3269}
3270
// Custom selection for chainless intrinsics. The wave-mode wrappers
// (wqm/softwqm/wwm/strict_*) map 1:1 onto single-source pseudo opcodes; a few
// others have dedicated selectors. Convergence-control glue, when present, is
// rewritten to a CONVERGENCECTRL_GLUE machine node and re-attached at the end.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(SDValue(ConvGlueNode, 0));

    // Rewrite the boolean "fi" operand into the DPP fetch-invalid immediate.
    bool FI = N->getConstantOperandVal(3);
    NewOps[2] = CurDAG->getTargetConstant(
        FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);

    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  // One of the simple single-source wrappers matched above.
  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  // Re-attach the convergence glue to whatever node N became.
  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
3343
3344void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3345 unsigned IntrID = N->getConstantOperandVal(1);
3346 switch (IntrID) {
3347 case Intrinsic::amdgcn_ds_gws_init:
3348 case Intrinsic::amdgcn_ds_gws_barrier:
3349 case Intrinsic::amdgcn_ds_gws_sema_v:
3350 case Intrinsic::amdgcn_ds_gws_sema_br:
3351 case Intrinsic::amdgcn_ds_gws_sema_p:
3352 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3353 SelectDS_GWS(N, IntrID);
3354 return;
3355 case Intrinsic::amdgcn_tensor_load_to_lds:
3356 case Intrinsic::amdgcn_tensor_store_from_lds:
3357 SelectTensorLoadStore(N, IntrID);
3358 return;
3359 default:
3360 break;
3361 }
3362
3363 SelectCode(N);
3364}
3365
3366void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3367 SDValue Log2WaveSize =
3368 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3369 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3370 {N->getOperand(0), Log2WaveSize});
3371}
3372
3373void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3374 SDValue SrcVal = N->getOperand(1);
3375 if (SrcVal.getValueType() != MVT::i32) {
3376 SelectCode(N); // Emit default error
3377 return;
3378 }
3379
3380 SDValue CopyVal;
3381 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3382 SDLoc SL(N);
3383
3384 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3385 CopyVal = SrcVal.getOperand(0);
3386 } else {
3387 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3388 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3389
3390 if (N->isDivergent()) {
3391 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3392 MVT::i32, SrcVal),
3393 0);
3394 }
3395
3396 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3397 {SrcVal, Log2WaveSize}),
3398 0);
3399 }
3400
3401 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3402 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3403}
3404
3405bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3406 unsigned &Mods,
3407 bool IsCanonicalizing,
3408 bool AllowAbs) const {
3409 Mods = SISrcMods::NONE;
3410 Src = In;
3411
3412 if (Src.getOpcode() == ISD::FNEG) {
3413 Mods |= SISrcMods::NEG;
3414 Src = Src.getOperand(0);
3415 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3416 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3417 // denormal mode, but we're implicitly canonicalizing in a source operand.
3418 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3419 if (LHS && LHS->isZero()) {
3420 Mods |= SISrcMods::NEG;
3421 Src = Src.getOperand(1);
3422 }
3423 }
3424
3425 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3426 Mods |= SISrcMods::ABS;
3427 Src = Src.getOperand(0);
3428 }
3429
3430 if (Mods != SISrcMods::NONE)
3431 return true;
3432
3433 // Convert various sign-bit masks on integers to src mods. Currently disabled
3434 // for 16-bit types as the codegen replaces the operand without adding a
3435 // srcmod. This is intentionally finding the cases where we are performing
3436 // float neg and abs on int types, the goal is not to obtain two's complement
3437 // neg or abs. Limit converison to select operands via the nonCanonalizing
3438 // pattern.
3439 // TODO: Add 16-bit support.
3440 if (IsCanonicalizing)
3441 return true;
3442
3443 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3444 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3445 // through the extract to the bitwise op.
3446 SDValue PeekSrc =
3447 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3448 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3449 // types as the codegen replaces the operand without adding a srcmod.
3450 // This is intentionally finding the cases where we are performing float neg
3451 // and abs on int types, the goal is not to obtain two's complement neg or
3452 // abs.
3453 // TODO: Add 16-bit support.
3454 unsigned Opc = PeekSrc.getOpcode();
3455 EVT VT = Src.getValueType();
3456 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3457 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3458 return true;
3459
3460 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3461 if (!CRHS)
3462 return true;
3463
3464 auto ReplaceSrc = [&]() -> SDValue {
3465 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3466 return Src.getOperand(0);
3467
3468 SDValue LHS = PeekSrc->getOperand(0);
3469 SDValue Index = Src->getOperand(1);
3470 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3471 Src.getValueType(), LHS, Index);
3472 };
3473
3474 // Recognise Srcmods:
3475 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3476 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3477 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3478 // SrcModifiers.
3479 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3480 Mods |= SISrcMods::NEG;
3481 Src = ReplaceSrc();
3482 } else if (Opc == ISD::AND && AllowAbs &&
3483 CRHS->getAPIntValue().isMaxSignedValue()) {
3484 Mods |= SISrcMods::ABS;
3485 Src = ReplaceSrc();
3486 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3488 Src = ReplaceSrc();
3489 }
3490
3491 return true;
3492}
3493
3494bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3495 SDValue &SrcMods) const {
3496 unsigned Mods;
3497 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3498 /*AllowAbs=*/true)) {
3499 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3500 return true;
3501 }
3502
3503 return false;
3504}
3505
3506bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3507 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3508 unsigned Mods;
3509 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3510 /*AllowAbs=*/true)) {
3511 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3512 return true;
3513 }
3514
3515 return false;
3516}
3517
3518bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3519 SDValue &SrcMods) const {
3520 unsigned Mods;
3521 if (SelectVOP3ModsImpl(In, Src, Mods,
3522 /*IsCanonicalizing=*/true,
3523 /*AllowAbs=*/false)) {
3524 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3525 return true;
3526 }
3527
3528 return false;
3529}
3530
3531bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3532 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3533 return false;
3534
3535 Src = In;
3536 return true;
3537}
3538
3539bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3540 SDValue &SrcMods,
3541 bool OpSel) const {
3542 unsigned Mods;
3543 if (SelectVOP3ModsImpl(In, Src, Mods,
3544 /*IsCanonicalizing=*/true,
3545 /*AllowAbs=*/false)) {
3546 if (OpSel)
3547 Mods |= SISrcMods::OP_SEL_0;
3548 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3549 return true;
3550 }
3551
3552 return false;
3553}
3554
3555bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3556 SDValue &SrcMods) const {
3557 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3558}
3559
3560bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3561 SDValue &SrcMods) const {
3562 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3563}
3564
3565bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3566 SDValue &SrcMods, SDValue &Clamp,
3567 SDValue &Omod) const {
3568 SDLoc DL(In);
3569 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3570 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3571
3572 return SelectVOP3Mods(In, Src, SrcMods);
3573}
3574
3575bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3576 SDValue &SrcMods, SDValue &Clamp,
3577 SDValue &Omod) const {
3578 SDLoc DL(In);
3579 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3580 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3581
3582 return SelectVOP3BMods(In, Src, SrcMods);
3583}
3584
3585bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3586 SDValue &Clamp, SDValue &Omod) const {
3587 Src = In;
3588
3589 SDLoc DL(In);
3590 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3591 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3592
3593 return true;
3594}
3595
3596bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3597 SDValue &SrcMods, bool IsDOT) const {
3598 unsigned Mods = SISrcMods::NONE;
3599 Src = In;
3600
3601 // TODO: Handle G_FSUB 0 as fneg
3602 if (Src.getOpcode() == ISD::FNEG) {
3604 Src = Src.getOperand(0);
3605 }
3606
3607 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3608 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3609 unsigned VecMods = Mods;
3610
3611 SDValue Lo = stripBitcast(Src.getOperand(0));
3612 SDValue Hi = stripBitcast(Src.getOperand(1));
3613
3614 if (Lo.getOpcode() == ISD::FNEG) {
3615 Lo = stripBitcast(Lo.getOperand(0));
3616 Mods ^= SISrcMods::NEG;
3617 }
3618
3619 if (Hi.getOpcode() == ISD::FNEG) {
3620 Hi = stripBitcast(Hi.getOperand(0));
3621 Mods ^= SISrcMods::NEG_HI;
3622 }
3623
3624 if (isExtractHiElt(Lo, Lo))
3625 Mods |= SISrcMods::OP_SEL_0;
3626
3627 if (isExtractHiElt(Hi, Hi))
3628 Mods |= SISrcMods::OP_SEL_1;
3629
3630 unsigned VecSize = Src.getValueSizeInBits();
3631 Lo = stripExtractLoElt(Lo);
3632 Hi = stripExtractLoElt(Hi);
3633
3634 if (Lo.getValueSizeInBits() > VecSize) {
3635 Lo = CurDAG->getTargetExtractSubreg(
3636 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3637 MVT::getIntegerVT(VecSize), Lo);
3638 }
3639
3640 if (Hi.getValueSizeInBits() > VecSize) {
3641 Hi = CurDAG->getTargetExtractSubreg(
3642 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3643 MVT::getIntegerVT(VecSize), Hi);
3644 }
3645
3646 assert(Lo.getValueSizeInBits() <= VecSize &&
3647 Hi.getValueSizeInBits() <= VecSize);
3648
3649 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3650 // Really a scalar input. Just select from the low half of the register to
3651 // avoid packing.
3652
3653 if (VecSize == Lo.getValueSizeInBits()) {
3654 Src = Lo;
3655 } else if (VecSize == 32) {
3656 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3657 } else {
3658 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3659
3660 SDLoc SL(In);
3662 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3663 Lo.getValueType()), 0);
3664 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3665 : AMDGPU::SReg_64RegClassID;
3666 const SDValue Ops[] = {
3667 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3668 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3669 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3670
3671 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3672 Src.getValueType(), Ops), 0);
3673 }
3674 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3675 return true;
3676 }
3677
3678 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3679 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3680 .bitcastToAPInt().getZExtValue();
3681 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3682 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3683 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3684 return true;
3685 }
3686 }
3687
3688 Mods = VecMods;
3689 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3690 Src.getNumOperands() == 2) {
3691
3692 // TODO: We should repeat the build_vector source check above for the
3693 // vector_shuffle for negates and casts of individual elements.
3694
3695 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3696 ArrayRef<int> Mask = SVN->getMask();
3697
3698 if (Mask[0] < 2 && Mask[1] < 2) {
3699 // src1 should be undef.
3700 SDValue ShuffleSrc = SVN->getOperand(0);
3701
3702 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3703 ShuffleSrc = ShuffleSrc.getOperand(0);
3705 }
3706
3707 if (Mask[0] == 1)
3708 Mods |= SISrcMods::OP_SEL_0;
3709 if (Mask[1] == 1)
3710 Mods |= SISrcMods::OP_SEL_1;
3711
3712 Src = ShuffleSrc;
3713 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3714 return true;
3715 }
3716 }
3717
3718 // Packed instructions do not have abs modifiers.
3719 Mods |= SISrcMods::OP_SEL_1;
3720
3721 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3722 return true;
3723}
3724
3725bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3726 SDValue &SrcMods) const {
3727 return SelectVOP3PMods(In, Src, SrcMods, true);
3728}
3729
3730bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3731 SDValue SrcTmp, SrcModsTmp;
3732 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3733 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3734 Src = SrcTmp;
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3742 SDValue &SrcMods) const {
3743 SelectVOP3Mods(In, Src, SrcMods);
3744 unsigned Mods = SISrcMods::OP_SEL_1;
3745 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3746 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3747 return true;
3748}
3749
3750bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3751 SDValue SrcTmp, SrcModsTmp;
3752 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3753 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3754 Src = SrcTmp;
3755 return true;
3756 }
3757
3758 return false;
3759}
3760
3761bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3762 SDValue &Src) const {
3763 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3764 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3765
3766 unsigned Mods = SISrcMods::OP_SEL_1;
3767 unsigned SrcVal = C->getZExtValue();
3768 if (SrcVal == 1)
3769 Mods |= SISrcMods::OP_SEL_0;
3770
3771 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3772 return true;
3773}
3774
// Pack 2/4/8 32-bit elements into one REG_SEQUENCE machine node, choosing
// the destination class by element count: 2 -> VReg_64 (v2i32),
// 4 -> VReg_128 (v4i32), 8 -> VReg_256 (v8i32).
// NOTE(review): this listing dropped several original lines (the
// "MachineSDNode *" return type, the local Ops vector declaration, and the
// per-element sub-register index operand) -- confirm against the full source.
3776AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3777 const SDLoc &DL) const {
3778 unsigned DstRegClass;
3779 EVT DstTy;
3780 switch (Elts.size()) {
3781 case 8:
3782 DstRegClass = AMDGPU::VReg_256RegClassID;
3783 DstTy = MVT::v8i32;
3784 break;
3785 case 4:
3786 DstRegClass = AMDGPU::VReg_128RegClassID;
3787 DstTy = MVT::v4i32;
3788 break;
3789 case 2:
3790 DstRegClass = AMDGPU::VReg_64RegClassID;
3791 DstTy = MVT::v2i32;
3792 break;
3793 default:
3794 llvm_unreachable("unhandled Reg sequence size");
3795 }
3796
// REG_SEQUENCE operands: destination register-class id first, then each
// element followed by its sub-register index.
3798 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3799 for (unsigned i = 0; i < Elts.size(); ++i) {
3800 Ops.push_back(Elts[i]);
3801 Ops.push_back(CurDAG->getTargetConstant(
3803 }
3804 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3805}
3806
// Pack 8 or 16 16-bit elements pairwise into 32-bit registers and build a
// REG_SEQUENCE via buildRegSequence32. If a pair was unpacked from the same
// 32-bit source, reuse that source directly; otherwise recombine the pair
// with v_perm_b32 (selector 0x05040100 picks the low halves of both inputs).
// NOTE(review): this listing dropped two original lines (the
// "MachineSDNode *" return type and the start of the Undef IMPLICIT_DEF
// initialization) -- confirm against the full source.
3808AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3809 const SDLoc &DL) const {
3810 SmallVector<SDValue, 8> PackedElts;
3811 assert("unhandled Reg sequence size" &&
3812 (Elts.size() == 8 || Elts.size() == 16));
3813
3814 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3815 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3816 for (unsigned i = 0; i < Elts.size(); i += 2) {
3817 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3818 SDValue HiSrc;
3819 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3820 PackedElts.push_back(HiSrc);
3821 } else {
3822 if (Subtarget->useRealTrue16Insts()) {
3823 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3824 // passing to v_perm_b32. Eventually we should replace v_perm_b32
3825 // by reg_sequence.
3827 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3828 0);
3829 Elts[i] =
3830 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3831 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3832 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3833 MVT::i32, {Elts[i + 1], Undef},
3834 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3835 }
3836 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3837 MachineSDNode *Packed =
3838 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3839 {Elts[i + 1], Elts[i], PackLoLo});
3840 PackedElts.push_back(SDValue(Packed, 0));
3841 }
3842 }
3843 return buildRegSequence32(PackedElts, DL);
3844}
3845
// Dispatch to the 16- or 32-bit REG_SEQUENCE builder based on ElementSize.
// NOTE(review): the "MachineSDNode *" return-type line was dropped by the
// extraction -- confirm against the full source.
3847AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3848 const SDLoc &DL,
3849 unsigned ElementSize) const {
3850 if (ElementSize == 16)
3851 return buildRegSequence16(Elts, DL);
3852 if (ElementSize == 32)
3853 return buildRegSequence32(Elts, DL);
3854 llvm_unreachable("Unhandled element size");
3855}
3856
// Fold a uniform per-element modifier (FNEG or FABS) into Mods and build
// the reg_sequence source from the already-stripped elements. For FNEG,
// additionally detects fneg(fabs(x)) on every element and folds both
// (NEG | NEG_HI); plain FABS maps to NEG_HI.
// NOTE(review): the "SmallVectorImpl<SDValue> &Elts," parameter line was
// dropped by the extraction -- confirm against the full source.
3857void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3858 unsigned &Mods,
3860 SDValue &Src, const SDLoc &DL,
3861 unsigned ElementSize) const {
3862 if (ModOpcode == ISD::FNEG) {
3863 Mods |= SISrcMods::NEG;
3864 // Check if all elements also have abs modifier
3865 SmallVector<SDValue, 8> NegAbsElts;
3866 for (auto El : Elts) {
3867 if (El.getOpcode() != ISD::FABS)
3868 break;
3869 NegAbsElts.push_back(El->getOperand(0));
3870 }
3871 if (Elts.size() != NegAbsElts.size()) {
3872 // Neg
3873 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3874 } else {
3875 // Neg and Abs
3876 Mods |= SISrcMods::NEG_HI;
3877 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3878 }
3879 } else {
3880 assert(ModOpcode == ISD::FABS);
3881 // Abs
3882 Mods |= SISrcMods::NEG_HI;
3883 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3884 }
3885}
3886
3887// Check all f16 elements for modifiers while looking through b32 and v2b16
3888// build vector, stop if element does not satisfy ModifierCheck.
// NOTE(review): the line carrying the function name and BuildVectorSDNode
// parameter ("checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,") was
// dropped by the extraction -- confirm against the full source.
3889static void
3891 std::function<bool(SDValue)> ModifierCheck) {
// Outer loop walks the 32-bit lanes of BV; the inner loop walks the two f16
// halves of each lane (the inner index intentionally shadows the outer one,
// and a failed check only stops scanning the current lane).
3892 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3893 if (auto *F16Pair =
3894 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3895 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3896 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3897 if (!ModifierCheck(ElF16))
3898 break;
3899 }
3900 }
3901 }
3902}
3903
// Select a WMMA f16 source folding only the neg modifier: if every f16 (or
// every v2f16) element of the build_vector is an FNEG, strip the FNEGs,
// rebuild the source as a reg_sequence and set NEG | NEG_HI. Always
// succeeds; falls back to Src = In with default OP_SEL_1 mods.
// NOTE(review): the local EltsF16 vector declaration was dropped by the
// extraction -- confirm against the full source.
3904bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3905 SDValue &SrcMods) const {
3906 Src = In;
3907 unsigned Mods = SISrcMods::OP_SEL_1;
3908
3909 // mods are on f16 elements
3910 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3912
3913 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3914 if (Element.getOpcode() != ISD::FNEG)
3915 return false;
3916 EltsF16.push_back(Element.getOperand(0));
3917 return true;
3918 });
3919
3920 // All elements have neg modifier
3921 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3922 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
3923 Mods |= SISrcMods::NEG;
3924 Mods |= SISrcMods::NEG_HI;
3925 }
3926 }
3927
3928 // mods are on v2f16 elements
3929 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3930 SmallVector<SDValue, 8> EltsV2F16;
3931 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3932 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3933 // Based on first element decide which mod we match, neg or abs
3934 if (ElV2f16.getOpcode() != ISD::FNEG)
3935 break;
3936 EltsV2F16.push_back(ElV2f16.getOperand(0));
3937 }
3938
3939 // All pairs of elements have neg modifier
3940 if (BV->getNumOperands() == EltsV2F16.size()) {
3941 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
3942 Mods |= SISrcMods::NEG;
3943 Mods |= SISrcMods::NEG_HI;
3944 }
3945 }
3946
3947 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3948 return true;
3949}
3950
// Select a WMMA f16 source folding a uniform neg OR abs modifier: the first
// element decides which opcode is matched; if all f16 (or all v2f16)
// elements carry it, the folding is delegated to selectWMMAModsNegAbs.
// Always succeeds; falls back to Src = In with default OP_SEL_1 mods.
// NOTE(review): the local EltsF16 vector declaration was dropped by the
// extraction -- confirm against the full source.
3951bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3952 SDValue &SrcMods) const {
3953 Src = In;
3954 unsigned Mods = SISrcMods::OP_SEL_1;
3955 unsigned ModOpcode;
3956
3957 // mods are on f16 elements
3958 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3960 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3961 // Based on first element decide which mod we match, neg or abs
3962 if (EltsF16.empty())
3963 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3964 if (ElF16.getOpcode() != ModOpcode)
3965 return false;
3966 EltsF16.push_back(ElF16.getOperand(0));
3967 return true;
3968 });
3969
3970 // All elements have ModOpcode modifier
3971 if (BV->getNumOperands() * 2 == EltsF16.size())
3972 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
3973 }
3974
3975 // mods are on v2f16 elements
3976 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3977 SmallVector<SDValue, 8> EltsV2F16;
3978
3979 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3980 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3981 // Based on first element decide which mod we match, neg or abs
3982 if (EltsV2F16.empty())
3983 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3984 if (ElV2f16->getOpcode() != ModOpcode)
3985 break;
3986 EltsV2F16.push_back(ElV2f16->getOperand(0));
3987 }
3988
3989 // All elements have ModOpcode modifier
3990 if (BV->getNumOperands() == EltsV2F16.size())
3991 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
3992 }
3993
3994 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3995 return true;
3996}
3997
// Select a WMMA f32 source folding a uniform neg OR abs modifier across all
// build_vector elements (opcode chosen by the first element). Always
// succeeds; falls back to Src = In with default OP_SEL_1 mods.
// NOTE(review): the local EltsF32 vector declaration was dropped by the
// extraction -- confirm against the full source.
3998bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3999 SDValue &SrcMods) const {
4000 Src = In;
4001 unsigned Mods = SISrcMods::OP_SEL_1;
4003
4004 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4005 assert(BV->getNumOperands() > 0);
4006 // Based on first element decide which mod we match, neg or abs
4007 SDValue ElF32 = stripBitcast(BV->getOperand(0));
4008 unsigned ModOpcode =
4009 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4010 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4011 SDValue ElF32 = stripBitcast(BV->getOperand(i));
4012 if (ElF32.getOpcode() != ModOpcode)
4013 break;
4014 EltsF32.push_back(ElF32.getOperand(0));
4015 }
4016
4017 // All elements had ModOpcode modifier
4018 if (BV->getNumOperands() == EltsF32.size())
4019 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4020 }
4021
4022 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4023 return true;
4024}
4025
// Select a WMMA inline-immediate source: either a 32-bit splat whose value
// is an inline constant, or a 16-bit splat (seen through two levels of
// build_vector/bitcast) that is inline-legal for f16/bf16/i16.
// NOTE(review): the APFloat semantics arguments of the FloatVal constructor
// (IEEEhalf vs BFloat selection) were dropped by the extraction -- confirm
// against the full source.
4026bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4027 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4028 BitVector UndefElements;
4029 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4030 if (isInlineImmediate(Splat.getNode())) {
4031 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4032 unsigned Imm = C->getAPIntValue().getSExtValue();
4033 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4034 return true;
4035 }
4036 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4037 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4038 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4039 return true;
4040 }
4041 llvm_unreachable("unhandled Constant node");
4042 }
4043 }
4044
4045 // 16 bit splat
4046 SDValue SplatSrc32 = stripBitcast(In);
4047 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4048 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4049 SDValue SplatSrc16 = stripBitcast(Splat32);
4050 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4051 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4052 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4053 std::optional<APInt> RawValue;
4054 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4055 RawValue = C->getValueAPF().bitcastToAPInt();
4056 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4057 RawValue = C->getAPIntValue();
4058
4059 if (RawValue.has_value()) {
4060 EVT VT = In.getValueType().getScalarType();
4061 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
// Reinterpret the raw 16 bits in the element's own FP semantics before
// the inline-constant legality check.
4062 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4065 RawValue.value());
4066 if (TII->isInlineConstant(FloatVal)) {
4067 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4068 MVT::i16);
4069 return true;
4070 }
4071 } else if (VT.getSimpleVT() == MVT::i16) {
4072 if (TII->isInlineConstant(RawValue.value())) {
4073 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4074 MVT::i16);
4075 return true;
4076 }
4077 } else
4078 llvm_unreachable("unknown 16-bit type");
4079 }
4080 }
4081 }
4082
4083 return false;
4084}
4085
4086bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4087 SDValue &IndexKey) const {
4088 unsigned Key = 0;
4089 Src = In;
4090
4091 if (In.getOpcode() == ISD::SRL) {
4092 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4093 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4094 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4095 ShiftAmt->getZExtValue() % 8 == 0) {
4096 Key = ShiftAmt->getZExtValue() / 8;
4097 Src = ShiftSrc;
4098 }
4099 }
4100
4101 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4102 return true;
4103}
4104
4105bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4106 SDValue &IndexKey) const {
4107 unsigned Key = 0;
4108 Src = In;
4109
4110 if (In.getOpcode() == ISD::SRL) {
4111 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4112 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4113 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4114 ShiftAmt->getZExtValue() == 16) {
4115 Key = 1;
4116 Src = ShiftSrc;
4117 }
4118 }
4119
4120 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4121 return true;
4122}
4123
4124bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4125 SDValue &IndexKey) const {
4126 unsigned Key = 0;
4127 Src = In;
4128
4129 SDValue InI32;
4130
4131 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4132 const SDValue &ExtendSrc = In.getOperand(0);
4133 if (ExtendSrc.getValueSizeInBits() == 32)
4134 InI32 = ExtendSrc;
4135 } else if (In->getOpcode() == ISD::BITCAST) {
4136 const SDValue &CastSrc = In.getOperand(0);
4137 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4138 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4139 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4140 if (Zero && Zero->getZExtValue() == 0)
4141 InI32 = CastSrc.getOperand(0);
4142 }
4143 }
4144
4145 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4146 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4147 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4148 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4149 EltIdx->getZExtValue() == 1) {
4150 Key = 1;
4151 Src = ExtractVecEltSrc;
4152 }
4153 }
4154
4155 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4156 return true;
4157}
4158
4159bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4160 SDValue &SrcMods) const {
4161 Src = In;
4162 // FIXME: Handle op_sel
4163 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4164 return true;
4165}
4166
4167bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4168 SDValue &SrcMods) const {
4169 // FIXME: Handle op_sel
4170 return SelectVOP3Mods(In, Src, SrcMods);
4171}
4172
4173// Match lowered fpext from bf16 to f32. This is a bit operation extending
4174// a 16-bit value with 16-bit of zeroes at LSB:
4175//
4176// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4177// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4178// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4179static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4180 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4181 return SDValue();
4182 Op = Op.getOperand(0);
4183
4184 IsExtractHigh = false;
4185 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4186 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4187 if (!Low16 || !Low16->isZero())
4188 return SDValue();
4189 Op = stripBitcast(Op.getOperand(1));
4190 if (Op.getValueType() != MVT::bf16)
4191 return SDValue();
4192 return Op;
4193 }
4194
4195 if (Op.getValueType() != MVT::i32)
4196 return SDValue();
4197
4198 if (Op.getOpcode() == ISD::AND) {
4199 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4200 if (Mask->getZExtValue() == 0xffff0000) {
4201 IsExtractHigh = true;
4202 return Op.getOperand(0);
4203 }
4204 }
4205 return SDValue();
4206 }
4207
4208 if (Op.getOpcode() == ISD::SHL) {
4209 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4210 if (Amt->getZExtValue() == 16)
4211 return Op.getOperand(0);
4212 }
4213 }
4214
4215 return SDValue();
4216}
4217
4218// The return value is not whether the match is possible (which it always is),
4219// but whether or not a conversion is really used.
4220bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4221 unsigned &Mods,
4222 MVT VT) const {
4223 Mods = 0;
4224 SelectVOP3ModsImpl(In, Src, Mods);
4225
// Look for an explicit fp_extend, or (for bf16) the integer bit patterns
// matchBF16FPExtendLike recognizes as an extend from bf16.
4226 bool IsExtractHigh = false;
4227 if (Src.getOpcode() == ISD::FP_EXTEND) {
4228 Src = Src.getOperand(0);
4229 } else if (VT == MVT::bf16) {
4230 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4231 if (!B16)
4232 return false;
4233 Src = B16;
4234 } else
4235 return false;
4236
4237 if (Src.getValueType() != VT &&
4238 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4239 return false;
4240
4241 Src = stripBitcast(Src);
4242
4243 // Be careful about folding modifiers if we already have an abs. fneg is
4244 // applied last, so we don't want to apply an earlier fneg.
4245 if ((Mods & SISrcMods::ABS) == 0) {
4246 unsigned ModsTmp;
4247 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4248
// An inner fneg toggles NEG (two fnegs cancel); an inner fabs is sticky.
4249 if ((ModsTmp & SISrcMods::NEG) != 0)
4250 Mods ^= SISrcMods::NEG;
4251
4252 if ((ModsTmp & SISrcMods::ABS) != 0)
4253 Mods |= SISrcMods::ABS;
4254 }
4255
4256 // op_sel/op_sel_hi decide the source type and source.
4257 // If the source's op_sel_hi is set, it indicates to do a conversion from
4258 // fp16. If the sources's op_sel is set, it picks the high half of the source
4259 // register.
4260
4261 Mods |= SISrcMods::OP_SEL_1;
4262 if (Src.getValueSizeInBits() == 16) {
4263 if (isExtractHiElt(Src, Src)) {
4264 Mods |= SISrcMods::OP_SEL_0;
4265
4266 // TODO: Should we try to look for neg/abs here?
4267 return true;
4268 }
4269
// A truncate from i32 means the low half of the wider register is the
// source; use the i32 directly.
4270 if (Src.getOpcode() == ISD::TRUNCATE &&
4271 Src.getOperand(0).getValueType() == MVT::i32) {
4272 Src = Src.getOperand(0);
4273 return true;
4274 }
4275
4276 if (Subtarget->useRealTrue16Insts())
4277 // In true16 mode, pack src to a 32bit
4278 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4279 } else if (IsExtractHigh)
4280 Mods |= SISrcMods::OP_SEL_0;
4281
4282 return true;
4283}
4284
4285bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4286 SDValue &SrcMods) const {
4287 unsigned Mods = 0;
4288 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4289 return false;
4290 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4291 return true;
4292}
4293
4294bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4295 SDValue &SrcMods) const {
4296 unsigned Mods = 0;
4297 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4298 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4299 return true;
4300}
4301
4302bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4303 SDValue &SrcMods) const {
4304 unsigned Mods = 0;
4305 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4306 return false;
4307 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4308 return true;
4309}
4310
4311bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4312 SDValue &SrcMods) const {
4313 unsigned Mods = 0;
4314 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4315 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4316 return true;
4317}
4318
4319// Match BITOP3 operation and return a number of matched instructions plus
4320// truth table.
// Recursively folds a tree of AND/OR/XOR over at most three distinct
// operands (collected into Src) into an 8-entry truth table, the encoding
// BITOP3 uses. Returns {0, 0} when the tree cannot be expressed that way.
// NOTE(review): the second parameter line ("SmallVectorImpl<SDValue> &Src)
// {") was dropped by the extraction -- confirm against the full source.
4321static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4323 unsigned NumOpcodes = 0;
4324 uint8_t LHSBits, RHSBits;
4325
// Map an operand to its 8-bit truth-table column: constants map to
// 0x00/0xff, already-collected operands reuse their column, and up to three
// new operands are assigned the canonical columns 0xf0/0xcc/0xaa.
4326 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4327 // Define truth table given Src0, Src1, Src2 bits permutations:
4328 // 0 0 0
4329 // 0 0 1
4330 // 0 1 0
4331 // 0 1 1
4332 // 1 0 0
4333 // 1 0 1
4334 // 1 1 0
4335 // 1 1 1
4336 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4337
4338 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4339 if (C->isAllOnes()) {
4340 Bits = 0xff;
4341 return true;
4342 }
4343 if (C->isZero()) {
4344 Bits = 0;
4345 return true;
4346 }
4347 }
4348
4349 for (unsigned I = 0; I < Src.size(); ++I) {
4350 // Try to find existing reused operand
4351 if (Src[I] == Op) {
4352 Bits = SrcBits[I];
4353 return true;
4354 }
4355 // Try to replace parent operator
4356 if (Src[I] == In) {
4357 Bits = SrcBits[I];
4358 Src[I] = Op;
4359 return true;
4360 }
4361 }
4362
4363 if (Src.size() == 3) {
4364 // No room left for operands. Try one last time, there can be a 'not' of
4365 // one of our source operands. In this case we can compute the bits
4366 // without growing Src vector.
4367 if (Op.getOpcode() == ISD::XOR) {
4368 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4369 if (C->isAllOnes()) {
4370 SDValue LHS = Op.getOperand(0);
4371 for (unsigned I = 0; I < Src.size(); ++I) {
4372 if (Src[I] == LHS) {
4373 Bits = ~SrcBits[I];
4374 return true;
4375 }
4376 }
4377 }
4378 }
4379 }
4380
4381 return false;
4382 }
4383
4384 Bits = SrcBits[Src.size()];
4385 Src.push_back(Op);
4386 return true;
4387 };
4388
4389 switch (In.getOpcode()) {
4390 case ISD::AND:
4391 case ISD::OR:
4392 case ISD::XOR: {
4393 SDValue LHS = In.getOperand(0);
4394 SDValue RHS = In.getOperand(1);
4395
// Save Src so it can be restored if either operand fails to map.
4396 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4397 if (!getOperandBits(LHS, LHSBits) ||
4398 !getOperandBits(RHS, RHSBits)) {
4399 Src = std::move(Backup);
4400 return std::make_pair(0, 0);
4401 }
4402
4403 // Recursion is naturally limited by the size of the operand vector.
4404 auto Op = BitOp3_Op(LHS, Src);
4405 if (Op.first) {
4406 NumOpcodes += Op.first;
4407 LHSBits = Op.second;
4408 }
4409
4410 Op = BitOp3_Op(RHS, Src);
4411 if (Op.first) {
4412 NumOpcodes += Op.first;
4413 RHSBits = Op.second;
4414 }
4415 break;
4416 }
4417 default:
4418 return std::make_pair(0, 0);
4419 }
4420
// Combine the children's truth tables with this node's boolean op.
4421 uint8_t TTbl;
4422 switch (In.getOpcode()) {
4423 case ISD::AND:
4424 TTbl = LHSBits & RHSBits;
4425 break;
4426 case ISD::OR:
4427 TTbl = LHSBits | RHSBits;
4428 break;
4429 case ISD::XOR:
4430 TTbl = LHSBits ^ RHSBits;
4431 break;
4432 default:
4433 break;
4434 }
4435
4436 return std::make_pair(NumOpcodes + 1, TTbl);
4437}
4438
// Select a V_BITOP3 pattern: collapse a tree of AND/OR/XOR over at most
// three operands into (Src0, Src1, Src2, truth-table). Fails when the tree
// is too small to be profitable or not expressible.
// NOTE(review): the local "SmallVector<SDValue, 3> Src;" declaration was
// dropped by the extraction -- confirm against the full source.
4439bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4440 SDValue &Src2, SDValue &Tbl) const {
4442 uint8_t TTbl;
4443 unsigned NumOpcodes;
4444
4445 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4446
4447 // Src.empty() case can happen if all operands are all zero or all ones.
4448 // Normally it shall be optimized out before reaching this.
4449 if (NumOpcodes < 2 || Src.empty())
4450 return false;
4451
4452 // For a uniform case threshold should be higher to account for moves between
4453 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4454 // and a readfirstlane after.
4455 if (NumOpcodes < 4 && !In->isDivergent())
4456 return false;
4457
4458 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4459 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4460 // asm more readable. This cannot be modeled with AddedComplexity because
4461 // selector does not know how many operations did we match.
4462 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4463 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4464 In.getOperand(1).getOpcode() == In.getOpcode()))
4465 return false;
4466
4467 if (In.getOpcode() == ISD::OR &&
4468 (In.getOperand(0).getOpcode() == ISD::AND ||
4469 In.getOperand(1).getOpcode() == ISD::AND))
4470 return false;
4471 }
4472
4473 // Last operand can be ignored, turning a ternary operation into a binary.
4474 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4475 // 'c' with 'a' here without changing the answer. In some pathological
4476 // cases it should be possible to get an operation with a single operand
4477 // too if optimizer would not catch it.
4478 while (Src.size() < 3)
4479 Src.push_back(Src[0]);
4480
4481 Src0 = Src[0];
4482 Src1 = Src[1];
4483 Src2 = Src[2];
4484
4485 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4486 return true;
4487}
4488
4489SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4490 if (In.isUndef())
4491 return CurDAG->getUNDEF(MVT::i32);
4492
4493 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4494 SDLoc SL(In);
4495 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4496 }
4497
4498 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4499 SDLoc SL(In);
4500 return CurDAG->getConstant(
4501 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4502 }
4503
4504 SDValue Src;
4505 if (isExtractHiElt(In, Src))
4506 return Src;
4507
4508 return SDValue();
4509}
4510
// Decide whether an immediate should materialize into a VGPR rather than an
// SGPR: true iff, among the first (up to 10) uses of N, at least one use
// strictly requires a VGPR even after trying to commute the user, and no
// use requires an SGPR (or has an unknown class).
4511bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4512 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4513
4514 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4515 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4516
// Limit caps how many uses are examined; if we stop because the cap was
// hit (Limit == 10), the conservative answer below is false.
4517 unsigned Limit = 0;
4518 bool AllUsesAcceptSReg = true;
4519 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4520 Limit < 10 && U != E; ++U, ++Limit) {
4521 const TargetRegisterClass *RC =
4522 getOperandRegClass(U->getUser(), U->getOperandNo());
4523
4524 // If the register class is unknown, it could be an unknown
4525 // register class that needs to be an SGPR, e.g. an inline asm
4526 // constraint
4527 if (!RC || SIRI->isSGPRClass(RC))
4528 return false;
4529
4530 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4531 RC != &AMDGPU::VS_64_Align2RegClass) {
4532 AllUsesAcceptSReg = false;
4533 SDNode *User = U->getUser();
4534 if (User->isMachineOpcode()) {
// A commutable user may be able to take the immediate in its other
// operand slot; if the commuted slot accepts VS_32/VS_64, this use does
// not force the VGPR decision after all.
4535 unsigned Opc = User->getMachineOpcode();
4536 const MCInstrDesc &Desc = SII->get(Opc);
4537 if (Desc.isCommutable()) {
4538 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4539 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4540 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4541 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4542 const TargetRegisterClass *CommutedRC =
4543 getOperandRegClass(U->getUser(), CommutedOpNo);
4544 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4545 CommutedRC == &AMDGPU::VS_64RegClass ||
4546 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4547 AllUsesAcceptSReg = true;
4548 }
4549 }
4550 }
4551 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4552 // commuting current user. This means have at least one use
4553 // that strictly require VGPR. Thus, we will not attempt to commute
4554 // other user instructions.
4555 if (!AllUsesAcceptSReg)
4556 break;
4557 }
4558 }
4559 return !AllUsesAcceptSReg && (Limit < 10);
4560}
4561
// Return true if this load can be selected as a scalar (SMEM) load: it must
// be uniform, have a known size with sufficient alignment, and read memory
// that cannot be clobbered (invariant, constant address space, or a simple
// global load proven non-clobbered when scalarizing globals is enabled).
4562bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4563 const auto *Ld = cast<LoadSDNode>(N);
4564 const MachineMemOperand *MMO = Ld->getMemOperand();
4565
4566 // FIXME: We ought to be able to take the direct isDivergent result. We
4567 // cannot rely on the MMO for a uniformity check, and should stop using
4568 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4569 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4570 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4571 // version, and then this can be dropped.
4572 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4573 return false;
4574
// Alignment requirement is min(size, 4) bytes.
4575 return MMO->getSize().hasValue() &&
4576 Ld->getAlign() >=
4577 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4578 uint64_t(4))) &&
4579 (MMO->isInvariant() ||
4580 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4581 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4582 (Subtarget->getScalarizeGlobalBehavior() &&
4583 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4584 Ld->isSimple() &&
4585 static_cast<const SITargetLowering *>(getTargetLowering())
4586 ->isMemOpHasNoClobberedMemOperand(N)));
4587}
4588
// NOTE(review): the extraction dropped this function's opening lines (the
// "void AMDGPUDAGToDAGISel::PostprocessISelDAG() {" signature and the start
// of the Lowering initialization) and the MachineSDNode cast of Node --
// confirm against the full source. The visible body repeatedly runs
// AMDGPUTargetLowering::PostISelFolding over every selected machine node
// until a full pass makes no change, removing dead nodes after each pass.
4591 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4592 bool IsModified = false;
4593 do {
4594 IsModified = false;
4595
4596 // Go over all selected nodes and try to fold them a bit more
4597 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4598 while (Position != CurDAG->allnodes_end()) {
4599 SDNode *Node = &*Position++;
4601 if (!MachineNode)
4602 continue;
4603
4604 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4605 if (ResNode != Node) {
4606 if (ResNode)
4607 ReplaceUses(Node, ResNode);
4608 IsModified = true;
4609 }
4610 }
4611 CurDAG->RemoveDeadNodes();
4612 } while (IsModified);
4613}
4614
4619
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
bool isSDWAOperand(const SDNode *N) const
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:295
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.