LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
379 return TII->isInlineConstant(C->getAPIntValue());
380
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
/// Determine the register class for \p OpNo.
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo, or nullptr if the register class cannot
/// be determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
438 Ops.push_back(NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(N->getOperand(i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
572 DL, EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
585}
586
588 EVT VT = N->getValueType(0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(N);
599
600 SDValue Src0 = SVN->getOperand(0);
601 SDValue Src1 = SVN->getOperand(1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
653
654 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
655 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
673 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(0) == MVT::i64) {
710 SelectAddcSubbI64(N);
711 return;
712 }
713
714 if (N->getValueType(0) != MVT::i32)
715 break;
716
717 SelectAddcSubb(N);
718 return;
719 case ISD::UADDO:
720 case ISD::USUBO: {
721 if (N->getValueType(0) == MVT::i64) {
722 SelectAddcSubbI64(N);
723 return;
724 }
725
726 SelectUADDO_USUBO(N);
727 return;
728 }
729 case AMDGPUISD::FMUL_W_CHAIN: {
730 SelectFMUL_W_CHAIN(N);
731 return;
732 }
733 case AMDGPUISD::FMA_W_CHAIN: {
734 SelectFMA_W_CHAIN(N);
735 return;
736 }
737
739 case ISD::BUILD_VECTOR: {
740 EVT VT = N->getValueType(0);
741 unsigned NumVectorElts = VT.getVectorNumElements();
742 if (VT.getScalarSizeInBits() == 16) {
743 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
744 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
745 ReplaceNode(N, Packed);
746 return;
747 }
748 }
749
750 break;
751 }
752
753 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
754 assert(VT.getVectorElementType().bitsEq(MVT::i32));
755 const TargetRegisterClass *RegClass =
756 N->isDivergent()
757 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
758 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
759
760 SelectBuildVector(N, RegClass->getID());
761 return;
762 }
765 return;
766 case ISD::BUILD_PAIR: {
767 SDValue RC, SubReg0, SubReg1;
768 SDLoc DL(N);
769 if (N->getValueType(0) == MVT::i128) {
770 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
771 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
772 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
773 } else if (N->getValueType(0) == MVT::i64) {
774 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
775 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
776 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
777 } else {
778 llvm_unreachable("Unhandled value type for BUILD_PAIR");
779 }
780 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
781 N->getOperand(1), SubReg1 };
782 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
783 N->getValueType(0), Ops));
784 return;
785 }
786
787 case ISD::Constant:
788 case ISD::ConstantFP: {
789 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
790 Subtarget->has64BitLiterals())
791 break;
792
793 uint64_t Imm;
795 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
796 if (AMDGPU::isValid32BitLiteral(Imm, true))
797 break;
798 } else {
800 Imm = C->getZExtValue();
801 if (AMDGPU::isValid32BitLiteral(Imm, false))
802 break;
803 }
804
805 SDLoc DL(N);
806 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
807 return;
808 }
809 case AMDGPUISD::BFE_I32:
810 case AMDGPUISD::BFE_U32: {
811 // There is a scalar version available, but unlike the vector version which
812 // has a separate operand for the offset and width, the scalar version packs
813 // the width and offset into a single operand. Try to move to the scalar
814 // version if the offsets are constant, so that we can try to keep extended
815 // loads of kernel arguments in SGPRs.
816
817 // TODO: Technically we could try to pattern match scalar bitshifts of
818 // dynamic values, but it's probably not useful.
820 if (!Offset)
821 break;
822
823 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
824 if (!Width)
825 break;
826
827 bool Signed = Opc == AMDGPUISD::BFE_I32;
828
829 uint32_t OffsetVal = Offset->getZExtValue();
830 uint32_t WidthVal = Width->getZExtValue();
831
832 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
833 WidthVal));
834 return;
835 }
836 case AMDGPUISD::DIV_SCALE: {
837 SelectDIV_SCALE(N);
838 return;
839 }
842 SelectMAD_64_32(N);
843 return;
844 }
845 case ISD::SMUL_LOHI:
846 case ISD::UMUL_LOHI:
847 return SelectMUL_LOHI(N);
848 case ISD::CopyToReg: {
850 *static_cast<const SITargetLowering*>(getTargetLowering());
851 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
852 break;
853 }
854 case ISD::AND:
855 case ISD::SRL:
856 case ISD::SRA:
858 if (N->getValueType(0) != MVT::i32)
859 break;
860
861 SelectS_BFE(N);
862 return;
863 case ISD::BRCOND:
864 SelectBRCOND(N);
865 return;
866 case ISD::FP_EXTEND:
867 SelectFP_EXTEND(N);
868 return;
869 case AMDGPUISD::CVT_PKRTZ_F16_F32:
870 case AMDGPUISD::CVT_PKNORM_I16_F32:
871 case AMDGPUISD::CVT_PKNORM_U16_F32:
872 case AMDGPUISD::CVT_PK_U16_U32:
873 case AMDGPUISD::CVT_PK_I16_I32: {
874 // Hack around using a legal type if f16 is illegal.
875 if (N->getValueType(0) == MVT::i32) {
876 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
877 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
878 { N->getOperand(0), N->getOperand(1) });
879 SelectCode(N);
880 return;
881 }
882
883 break;
884 }
886 SelectINTRINSIC_W_CHAIN(N);
887 return;
888 }
890 SelectINTRINSIC_WO_CHAIN(N);
891 return;
892 }
893 case ISD::INTRINSIC_VOID: {
894 SelectINTRINSIC_VOID(N);
895 return;
896 }
898 SelectWAVE_ADDRESS(N);
899 return;
900 }
901 case ISD::STACKRESTORE: {
902 SelectSTACKRESTORE(N);
903 return;
904 }
905 }
906
907 SelectCode(N);
908}
909
911 if (!Subtarget->hasSDWA())
912 return false;
913
914 if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
915 EVT VT = cast<VTSDNode>(N->getOperand(1))->getVT();
916 return VT.getScalarSizeInBits() == 8 || VT.getScalarSizeInBits() == 16;
917 }
918
919 if (N->getOpcode() == ISD::AND)
920 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
921 return RHS->getZExtValue() == 0xFF || RHS->getZExtValue() == 0xFFFF;
922
923 if (N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL)
924 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
925 return (RHS->getZExtValue() % 8) == 0;
926
927 return false;
928}
929
930bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
931 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
932 const Instruction *Term = BB->getTerminator();
933 return Term->getMetadata("amdgpu.uniform") ||
934 Term->getMetadata("structurizecfg.uniform");
935}
936
937bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
938 unsigned ShAmtBits) const {
939 assert(N->getOpcode() == ISD::AND);
940
941 const APInt &RHS = N->getConstantOperandAPInt(1);
942 if (RHS.countr_one() >= ShAmtBits)
943 return true;
944
945 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
946 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
947}
948
950 SDValue &N0, SDValue &N1) {
951 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
953 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
954 // (i64 (bitcast (v2i32 (build_vector
955 // (or (extract_vector_elt V, 0), OFFSET),
956 // (extract_vector_elt V, 1)))))
957 SDValue Lo = Addr.getOperand(0).getOperand(0);
958 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
959 SDValue BaseLo = Lo.getOperand(0);
960 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
961 // Check that split base (Lo and Hi) are extracted from the same one.
962 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
964 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
965 // Lo is statically extracted from index 0.
966 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
967 BaseLo.getConstantOperandVal(1) == 0 &&
968 // Hi is statically extracted from index 0.
969 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
970 BaseHi.getConstantOperandVal(1) == 1) {
971 N0 = BaseLo.getOperand(0).getOperand(0);
972 N1 = Lo.getOperand(1);
973 return true;
974 }
975 }
976 }
977 return false;
978}
979
980bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
981 SDValue &RHS) const {
982 if (CurDAG->isBaseWithConstantOffset(Addr)) {
983 LHS = Addr.getOperand(0);
984 RHS = Addr.getOperand(1);
985 return true;
986 }
987
990 return true;
991 }
992
993 return false;
994}
995
997 return "AMDGPU DAG->DAG Pattern Instruction Selection";
998}
999
1003
1007#ifdef EXPENSIVE_CHECKS
1009 .getManager();
1010 auto &F = MF.getFunction();
1011 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
1012 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
1013 for (auto &L : LI.getLoopsInPreorder())
1014 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
1015#endif
1016 return SelectionDAGISelPass::run(MF, MFAM);
1017}
1018
1019//===----------------------------------------------------------------------===//
1020// Complex Patterns
1021//===----------------------------------------------------------------------===//
1022
1023bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
1024 SDValue &Offset) {
1025 return false;
1026}
1027
1028bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
1029 SDValue &Offset) {
1031 SDLoc DL(Addr);
1032
1033 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1034 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1035 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1036 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1037 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1038 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1039 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1040 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1041 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1042 Base = Addr.getOperand(0);
1043 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1044 } else {
1045 Base = Addr;
1046 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1047 }
1048
1049 return true;
1050}
1051
1052SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1053 const SDLoc &DL) const {
1054 SDNode *Mov = CurDAG->getMachineNode(
1055 AMDGPU::S_MOV_B32, DL, MVT::i32,
1056 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1057 return SDValue(Mov, 0);
1058}
1059
1060// Keep this as a fallback for i64 ADDC/ADDE/SUBC/SUBE glue nodes. Wide integer
1061// add/sub should normally expand through the explicit carry nodes handled in
1062// SelectAddcSubbI64.
1063void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1064 SDLoc DL(N);
1065 SDValue LHS = N->getOperand(0);
1066 SDValue RHS = N->getOperand(1);
1067
1068 unsigned Opcode = N->getOpcode();
1069 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1070 bool ProduceCarry =
1071 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1072 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1073
1074 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1075 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1076
1077 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1078 DL, MVT::i32, LHS, Sub0);
1079 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1080 DL, MVT::i32, LHS, Sub1);
1081
1082 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1083 DL, MVT::i32, RHS, Sub0);
1084 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1085 DL, MVT::i32, RHS, Sub1);
1086
1087 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1088
1089 static const unsigned OpcMap[2][2][2] = {
1090 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1091 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1092 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1093 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1094
1095 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1096 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1097
1098 SDNode *AddLo;
1099 if (!ConsumeCarry) {
1100 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1101 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1102 } else {
1103 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1104 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1105 }
1106 SDValue AddHiArgs[] = {
1107 SDValue(Hi0, 0),
1108 SDValue(Hi1, 0),
1109 SDValue(AddLo, 1)
1110 };
1111 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1112
1113 SDValue RegSequenceArgs[] = {
1114 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1115 SDValue(AddLo,0),
1116 Sub0,
1117 SDValue(AddHi,0),
1118 Sub1,
1119 };
1120 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1121 MVT::i64, RegSequenceArgs);
1122
1123 if (ProduceCarry) {
1124 // Replace the carry-use
1125 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1126 }
1127
1128 // Replace the remaining uses.
1129 ReplaceNode(N, RegSequence);
1130}
1131
1132void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1133 SDValue LHS = N->getOperand(0);
1134 SDValue RHS = N->getOperand(1);
1135 SDValue CI = N->getOperand(2);
1136
1137 if (N->isDivergent()) {
1138 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1139 : AMDGPU::V_SUBB_U32_e64;
1140 CurDAG->SelectNodeTo(
1141 N, Opc, N->getVTList(),
1142 {LHS, RHS, CI,
1143 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1144 } else {
1145 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1146 : AMDGPU::S_SUB_CO_PSEUDO;
1147 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1148 }
1149}
1150
1151void AMDGPUDAGToDAGISel::SelectAddcSubbI64(SDNode *N) {
1152 SDLoc DL(N);
1153 SDValue LHS = N->getOperand(0);
1154 SDValue RHS = N->getOperand(1);
1155
1156 unsigned Opcode = N->getOpcode();
1157 bool ConsumeCarry = Opcode == ISD::UADDO_CARRY || Opcode == ISD::USUBO_CARRY;
1158 bool IsAdd = Opcode == ISD::UADDO || Opcode == ISD::UADDO_CARRY;
1159
1160 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1161 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1162
1163 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1164 MVT::i32, LHS, Sub0);
1165 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1166 MVT::i32, LHS, Sub1);
1167
1168 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1169 MVT::i32, RHS, Sub0);
1170 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1171 MVT::i32, RHS, Sub1);
1172
1173 SDVTList VTList = CurDAG->getVTList(MVT::i32, N->getValueType(1));
1174
1175 static const unsigned NoCarryOpcMap[2][2] = {
1176 {AMDGPU::S_USUBO_PSEUDO, AMDGPU::S_UADDO_PSEUDO},
1177 {AMDGPU::V_SUB_CO_U32_e64, AMDGPU::V_ADD_CO_U32_e64}};
1178 static const unsigned CarryOpcMap[2][2] = {
1179 {AMDGPU::S_SUB_CO_PSEUDO, AMDGPU::S_ADD_CO_PSEUDO},
1180 {AMDGPU::V_SUBB_U32_e64, AMDGPU::V_ADDC_U32_e64}};
1181
1182 bool IsVALU = N->isDivergent();
1183
1184 unsigned NoCarryOpc = NoCarryOpcMap[IsVALU][IsAdd];
1185 unsigned CarryOpc = CarryOpcMap[IsVALU][IsAdd];
1186 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1187
1188 SDNode *AddLo;
1189 if (!ConsumeCarry) {
1190 if (IsVALU) {
1191 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), Clamp};
1192 AddLo = CurDAG->getMachineNode(NoCarryOpc, DL, VTList, Args);
1193 } else {
1194 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0)};
1195 AddLo = CurDAG->getMachineNode(NoCarryOpc, DL, VTList, Args);
1196 }
1197 } else {
1198 if (IsVALU) {
1199 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2),
1200 Clamp};
1201 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1202 } else {
1203 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2)};
1204 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1205 }
1206 }
1207
1208 SDNode *AddHi;
1209 if (IsVALU) {
1210 SDValue Args[] = {SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1),
1211 Clamp};
1212 AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1213 } else {
1214 SDValue Args[] = {SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1)};
1215 AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1216 }
1217
1218 unsigned RC = IsVALU ? AMDGPU::VReg_64RegClassID : AMDGPU::SReg_64RegClassID;
1219 SDValue RegSequenceArgs[] = {CurDAG->getTargetConstant(RC, DL, MVT::i32),
1220 SDValue(AddLo, 0), Sub0, SDValue(AddHi, 0),
1221 Sub1};
1222 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1223 MVT::i64, RegSequenceArgs);
1224
1225 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1226 ReplaceNode(N, RegSequence);
1227}
1228
1229void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1230 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1231 // carry out despite the _i32 name. These were renamed in VI to _U32.
1232 // FIXME: We should probably rename the opcodes here.
1233 bool IsAdd = N->getOpcode() == ISD::UADDO;
1234 bool IsVALU = N->isDivergent();
1235
1236 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1237 ++UI)
1238 if (UI.getUse().getResNo() == 1) {
1239 if (UI->isMachineOpcode()) {
1240 if (UI->getMachineOpcode() !=
1241 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1242 IsVALU = true;
1243 break;
1244 }
1245 } else {
1246 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1247 IsVALU = true;
1248 break;
1249 }
1250 }
1251 }
1252
1253 if (IsVALU) {
1254 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1255
1256 CurDAG->SelectNodeTo(
1257 N, Opc, N->getVTList(),
1258 {N->getOperand(0), N->getOperand(1),
1259 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1260 } else {
1261 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1262
1263 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1264 {N->getOperand(0), N->getOperand(1)});
1265 }
1266}
1267
1268void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1269 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1270 SDValue Ops[10];
1271
1272 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1273 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1274 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1275 Ops[8] = N->getOperand(0);
1276 Ops[9] = N->getOperand(4);
1277
1278 // If there are no source modifiers, prefer fmac over fma because it can use
1279 // the smaller VOP2 encoding.
1280 bool UseFMAC = Subtarget->hasDLInsts() &&
1281 cast<ConstantSDNode>(Ops[0])->isZero() &&
1282 cast<ConstantSDNode>(Ops[2])->isZero() &&
1283 cast<ConstantSDNode>(Ops[4])->isZero();
1284 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1285 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1286}
1287
1288void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1289 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1290 SDValue Ops[8];
1291
1292 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1293 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1294 Ops[6] = N->getOperand(0);
1295 Ops[7] = N->getOperand(3);
1296
1297 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1298}
1299
1300// We need to handle this here because tablegen doesn't support matching
1301// instructions with multiple outputs.
1302void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1303 EVT VT = N->getValueType(0);
1304
1305 assert(VT == MVT::f32 || VT == MVT::f64);
1306
1307 unsigned Opc
1308 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1309
1310 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1311 // omod
1312 SDValue Ops[8];
1313 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1314 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1315 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1316 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1317}
1318
1319// We need to handle this here because tablegen doesn't support matching
1320// instructions with multiple outputs.
1321void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1322 SDLoc SL(N);
1323 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1324 unsigned Opc;
1325 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() && !N->hasAnyUseOfValue(1);
1326 if (Subtarget->hasMADIntraFwdBug())
1327 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1328 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1329 else if (UseNoCarry)
1330 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1331 else
1332 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1333
1334 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1335 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1336 Clamp };
1337
1338 if (UseNoCarry) {
1339 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1340 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1341 CurDAG->RemoveDeadNode(N);
1342 return;
1343 }
1344
1345 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1346}
1347
1348// We need to handle this here because tablegen doesn't support matching
1349// instructions with multiple outputs.
1350void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1351 SDLoc SL(N);
1352 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1353 SDVTList VTList;
1354 unsigned Opc;
1355 if (Subtarget->hasMadNC64_32Insts()) {
1356 VTList = CurDAG->getVTList(MVT::i64);
1357 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1358 } else {
1359 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1360 if (Subtarget->hasMADIntraFwdBug()) {
1361 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1362 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1363 } else {
1364 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1365 }
1366 }
1367
1368 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1369 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1370 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1371 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1372 if (!SDValue(N, 0).use_empty()) {
1373 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1374 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1375 MVT::i32, SDValue(Mad, 0), Sub0);
1376 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1377 }
1378 if (!SDValue(N, 1).use_empty()) {
1379 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1380 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1381 MVT::i32, SDValue(Mad, 0), Sub1);
1382 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1383 }
1384 CurDAG->RemoveDeadNode(N);
1385}
1386
1387bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1388 if (!isUInt<16>(Offset))
1389 return false;
1390
1391 if (!Base || Subtarget->hasUsableDSOffset() ||
1392 Subtarget->unsafeDSOffsetFoldingEnabled())
1393 return true;
1394
1395 // On Southern Islands instruction with a negative base value and an offset
1396 // don't seem to work.
1397 return CurDAG->SignBitIsZero(Base);
1398}
1399
1400bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1401 SDValue &Offset) const {
1402 SDLoc DL(Addr);
1403 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1404 SDValue N0 = Addr.getOperand(0);
1405 SDValue N1 = Addr.getOperand(1);
1406 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1407 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1408 // (add n0, c0)
1409 Base = N0;
1410 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1411 return true;
1412 }
1413 } else if (Addr.getOpcode() == ISD::SUB) {
1414 // sub C, x -> add (sub 0, x), C
1415 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1416 int64_t ByteOffset = C->getSExtValue();
1417 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1418 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1419
1420 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1421 // the known bits in isDSOffsetLegal. We need to emit the selected node
1422 // here, so this is thrown away.
1423 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1424 Zero, Addr.getOperand(1));
1425
1426 if (isDSOffsetLegal(Sub, ByteOffset)) {
1428 Opnds.push_back(Zero);
1429 Opnds.push_back(Addr.getOperand(1));
1430
1431 // FIXME: Select to VOP3 version for with-carry.
1432 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1433 if (Subtarget->hasAddNoCarryInsts()) {
1434 SubOp = AMDGPU::V_SUB_U32_e64;
1435 Opnds.push_back(
1436 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1437 }
1438
1439 MachineSDNode *MachineSub =
1440 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1441
1442 Base = SDValue(MachineSub, 0);
1443 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1444 return true;
1445 }
1446 }
1447 }
1448 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1449 // If we have a constant address, prefer to put the constant into the
1450 // offset. This can save moves to load the constant address since multiple
1451 // operations can share the zero base address register, and enables merging
1452 // into read2 / write2 instructions.
1453
1454 SDLoc DL(Addr);
1455
1456 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1457 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1458 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1459 DL, MVT::i32, Zero);
1460 Base = SDValue(MovZero, 0);
1461 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1462 return true;
1463 }
1464 }
1465
1466 // default case
1467 Base = Addr;
1468 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1469 return true;
1470}
1471
1472bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1473 unsigned Offset1,
1474 unsigned Size) const {
1475 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1476 return false;
1477 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1478 return false;
1479
1480 if (!Base || Subtarget->hasUsableDSOffset() ||
1481 Subtarget->unsafeDSOffsetFoldingEnabled())
1482 return true;
1483
1484 // On Southern Islands instruction with a negative base value and an offset
1485 // don't seem to work.
1486 return CurDAG->SignBitIsZero(Base);
1487}
1488
1489// Return whether the operation has NoUnsignedWrap property.
1490static bool isNoUnsignedWrap(SDValue Addr) {
1491 return (Addr.getOpcode() == ISD::ADD &&
1492 Addr->getFlags().hasNoUnsignedWrap()) ||
1493 Addr->getOpcode() == ISD::OR;
1494}
1495
1496// Check that the base address of flat scratch load/store in the form of `base +
1497// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1498// requirement). We always treat the first operand as the base address here.
1499bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1500 if (isNoUnsignedWrap(Addr))
1501 return true;
1502
1503 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1504 // values.
1505 if (Subtarget->hasSignedScratchOffsets())
1506 return true;
1507
1508 auto LHS = Addr.getOperand(0);
1509 auto RHS = Addr.getOperand(1);
1510
1511 // If the immediate offset is negative and within certain range, the base
1512 // address cannot also be negative. If the base is also negative, the sum
1513 // would be either negative or much larger than the valid range of scratch
1514 // memory a thread can access.
1515 ConstantSDNode *ImmOp = nullptr;
1516 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1517 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1518 return true;
1519 }
1520
1521 return CurDAG->SignBitIsZero(LHS);
1522}
1523
1524// Check address value in SGPR/VGPR are legal for flat scratch in the form
1525// of: SGPR + VGPR.
1526bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1527 if (isNoUnsignedWrap(Addr))
1528 return true;
1529
1530 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1531 // values.
1532 if (Subtarget->hasSignedScratchOffsets())
1533 return true;
1534
1535 auto LHS = Addr.getOperand(0);
1536 auto RHS = Addr.getOperand(1);
1537 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1538}
1539
1540// Check address value in SGPR/VGPR are legal for flat scratch in the form
1541// of: SGPR + VGPR + Imm.
1542bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1543 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1544 // values.
1545 if (AMDGPU::isGFX12Plus(*Subtarget))
1546 return true;
1547
1548 auto Base = Addr.getOperand(0);
1549 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1550 // If the immediate offset is negative and within certain range, the base
1551 // address cannot also be negative. If the base is also negative, the sum
1552 // would be either negative or much larger than the valid range of scratch
1553 // memory a thread can access.
1554 if (isNoUnsignedWrap(Base) &&
1555 (isNoUnsignedWrap(Addr) ||
1556 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1557 return true;
1558
1559 auto LHS = Base.getOperand(0);
1560 auto RHS = Base.getOperand(1);
1561 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1562}
1563
1564// TODO: If offset is too big, put low 16-bit into offset.
1565bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1566 SDValue &Offset0,
1567 SDValue &Offset1) const {
1568 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1569}
1570
1571bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1572 SDValue &Offset0,
1573 SDValue &Offset1) const {
1574 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1575}
1576
1577bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1578 SDValue &Offset0, SDValue &Offset1,
1579 unsigned Size) const {
1580 SDLoc DL(Addr);
1581
1582 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1583 SDValue N0 = Addr.getOperand(0);
1584 SDValue N1 = Addr.getOperand(1);
1585 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1586 unsigned OffsetValue0 = C1->getZExtValue();
1587 unsigned OffsetValue1 = OffsetValue0 + Size;
1588
1589 // (add n0, c0)
1590 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1591 Base = N0;
1592 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1593 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1594 return true;
1595 }
1596 } else if (Addr.getOpcode() == ISD::SUB) {
1597 // sub C, x -> add (sub 0, x), C
1598 if (const ConstantSDNode *C =
1600 unsigned OffsetValue0 = C->getZExtValue();
1601 unsigned OffsetValue1 = OffsetValue0 + Size;
1602
1603 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1604 SDLoc DL(Addr);
1605 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1606
1607 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1608 // the known bits in isDSOffsetLegal. We need to emit the selected node
1609 // here, so this is thrown away.
1610 SDValue Sub =
1611 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1612
1613 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1615 Opnds.push_back(Zero);
1616 Opnds.push_back(Addr.getOperand(1));
1617 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1618 if (Subtarget->hasAddNoCarryInsts()) {
1619 SubOp = AMDGPU::V_SUB_U32_e64;
1620 Opnds.push_back(
1621 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1622 }
1623
1624 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1625 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1626
1627 Base = SDValue(MachineSub, 0);
1628 Offset0 =
1629 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1630 Offset1 =
1631 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1632 return true;
1633 }
1634 }
1635 }
1636 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1637 unsigned OffsetValue0 = CAddr->getZExtValue();
1638 unsigned OffsetValue1 = OffsetValue0 + Size;
1639
1640 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1641 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1642 MachineSDNode *MovZero =
1643 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1644 Base = SDValue(MovZero, 0);
1645 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1646 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1647 return true;
1648 }
1649 }
1650
1651 // default case
1652
1653 Base = Addr;
1654 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1655 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1656 return true;
1657}
1658
1659bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1660 SDValue &SOffset, SDValue &Offset,
1661 SDValue &Offen, SDValue &Idxen,
1662 SDValue &Addr64) const {
1663 // Subtarget prefers to use flat instruction
1664 // FIXME: This should be a pattern predicate and not reach here
1665 if (Subtarget->useFlatForGlobal())
1666 return false;
1667
1668 SDLoc DL(Addr);
1669
1670 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1671 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1672 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1673 SOffset = Subtarget->hasRestrictedSOffset()
1674 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1675 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1676
1677 ConstantSDNode *C1 = nullptr;
1678 SDValue N0 = Addr;
1679 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1680 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1681 if (isUInt<32>(C1->getZExtValue()))
1682 N0 = Addr.getOperand(0);
1683 else
1684 C1 = nullptr;
1685 }
1686
1687 if (N0->isAnyAdd()) {
1688 // (add N2, N3) -> addr64, or
1689 // (add (add N2, N3), C1) -> addr64
1690 SDValue N2 = N0.getOperand(0);
1691 SDValue N3 = N0.getOperand(1);
1692 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1693
1694 if (N2->isDivergent()) {
1695 if (N3->isDivergent()) {
1696 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1697 // addr64, and construct the resource from a 0 address.
1698 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1699 VAddr = N0;
1700 } else {
1701 // N2 is divergent, N3 is not.
1702 Ptr = N3;
1703 VAddr = N2;
1704 }
1705 } else {
1706 // N2 is not divergent.
1707 Ptr = N2;
1708 VAddr = N3;
1709 }
1710 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1711 } else if (N0->isDivergent()) {
1712 // N0 is divergent. Use it as the addr64, and construct the resource from a
1713 // 0 address.
1714 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1715 VAddr = N0;
1716 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1717 } else {
1718 // N0 -> offset, or
1719 // (N0 + C1) -> offset
1720 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1721 Ptr = N0;
1722 }
1723
1724 if (!C1) {
1725 // No offset.
1726 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1727 return true;
1728 }
1729
1730 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1731 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1732 // Legal offset for instruction.
1733 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1734 return true;
1735 }
1736
1737 // Illegal offset, store it in soffset.
1738 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1739 SOffset =
1740 SDValue(CurDAG->getMachineNode(
1741 AMDGPU::S_MOV_B32, DL, MVT::i32,
1742 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1743 0);
1744 return true;
1745}
1746
1747bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1748 SDValue &VAddr, SDValue &SOffset,
1749 SDValue &Offset) const {
1750 SDValue Ptr, Offen, Idxen, Addr64;
1751
1752 // addr64 bit was removed for volcanic islands.
1753 // FIXME: This should be a pattern predicate and not reach here
1754 if (!Subtarget->hasAddr64())
1755 return false;
1756
1757 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1758 return false;
1759
1760 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1761 if (C->getSExtValue()) {
1762 SDLoc DL(Addr);
1763
1764 const SITargetLowering& Lowering =
1765 *static_cast<const SITargetLowering*>(getTargetLowering());
1766
1767 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1768 return true;
1769 }
1770
1771 return false;
1772}
1773
1774std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1775 SDLoc DL(N);
1776
1777 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1778 SDValue TFI =
1779 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1780
1781 // We rebase the base address into an absolute stack address and hence
1782 // use constant 0 for soffset. This value must be retained until
1783 // frame elimination and eliminateFrameIndex will choose the appropriate
1784 // frame register if need be.
1785 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1786}
1787
1788bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1789 SDValue Addr, SDValue &Rsrc,
1790 SDValue &VAddr, SDValue &SOffset,
1791 SDValue &ImmOffset) const {
1792
1793 SDLoc DL(Addr);
1794 MachineFunction &MF = CurDAG->getMachineFunction();
1795 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1796
1797 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1798
1799 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1800 int64_t Imm = CAddr->getSExtValue();
1801 const int64_t NullPtr =
1803 // Don't fold null pointer.
1804 if (Imm != NullPtr) {
1805 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1806 SDValue HighBits =
1807 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1808 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1809 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1810 VAddr = SDValue(MovHighBits, 0);
1811
1812 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1813 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1814 return true;
1815 }
1816 }
1817
1818 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1819 // (add n0, c1)
1820
1821 SDValue N0 = Addr.getOperand(0);
1822 uint64_t C1 = Addr.getConstantOperandVal(1);
1823
1824 // Offsets in vaddr must be positive if range checking is enabled.
1825 //
1826 // The total computation of vaddr + soffset + offset must not overflow. If
1827 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1828 // overflowing.
1829 //
1830 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1831 // always perform a range check. If a negative vaddr base index was used,
1832 // this would fail the range check. The overall address computation would
1833 // compute a valid address, but this doesn't happen due to the range
1834 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1835 //
1836 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1837 // MUBUF vaddr, but not on older subtargets which can only do this if the
1838 // sign bit is known 0.
1839 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1840 if (TII->isLegalMUBUFImmOffset(C1) &&
1841 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1842 CurDAG->SignBitIsZero(N0))) {
1843 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1844 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1845 return true;
1846 }
1847 }
1848
1849 // (node)
1850 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1851 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1852 return true;
1853}
1854
1855static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1856 if (Val.getOpcode() != ISD::CopyFromReg)
1857 return false;
1858 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1859 if (!Reg.isPhysical())
1860 return false;
1861 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1862 return RC && TRI.isSGPRClass(RC);
1863}
1864
1865bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1866 SDValue Addr,
1867 SDValue &SRsrc,
1868 SDValue &SOffset,
1869 SDValue &Offset) const {
1870 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1871 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1872 MachineFunction &MF = CurDAG->getMachineFunction();
1873 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1874 SDLoc DL(Addr);
1875
1876 // CopyFromReg <sgpr>
1877 if (IsCopyFromSGPR(*TRI, Addr)) {
1878 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1879 SOffset = Addr;
1880 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1881 return true;
1882 }
1883
1884 ConstantSDNode *CAddr;
1885 if (Addr.getOpcode() == ISD::ADD) {
1886 // Add (CopyFromReg <sgpr>) <constant>
1887 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1888 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1889 return false;
1890 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1891 return false;
1892
1893 SOffset = Addr.getOperand(0);
1894 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1895 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1896 // <constant>
1897 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1898 } else {
1899 return false;
1900 }
1901
1902 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1903
1904 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1905 return true;
1906}
1907
1908bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1909 SDValue &SOffset, SDValue &Offset
1910 ) const {
1911 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1912 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1913
1914 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1915 return false;
1916
1917 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1918 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1919 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1920 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1921 maskTrailingOnes<uint64_t>(32); // Size
1922 SDLoc DL(Addr);
1923
1924 const SITargetLowering& Lowering =
1925 *static_cast<const SITargetLowering*>(getTargetLowering());
1926
1927 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1928 return true;
1929 }
1930 return false;
1931}
1932
1933bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1934 SDValue &SOffset) const {
1935 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1936 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1937 return true;
1938 }
1939
1940 SOffset = ByteOffsetNode;
1941 return true;
1942}
1943
1944// Find a load or store from corresponding pattern root.
1945// Roots may be build_vector, bitconvert or their combinations.
// NOTE(review): listing lines 1946-1947 (the signature), 1950 and 1953 are
// absent from this excerpt. Callers in this file invoke the helper as
// findMemSDNode(N) and dereference the result directly - confirm the exact
// signature against the full file.
// Fast path: the root itself is a memory node.
1948 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1949 return MN;
// Otherwise inspect every operand value of the root for a memory node.
1951 for (SDValue V : N->op_values())
1952 if (MemSDNode *MN =
1954 return MN;
// No memory node reachable from the root is treated as a programming error.
1955 llvm_unreachable("cannot find MemSDNode in the pattern!");
1956}
1957
// Split Addr into a base address (VAddr) and an immediate (Offset) for a
// FLAT-family memory instruction of kind FlatVariant. N is the pattern root
// used to find the memory node and its address space.
//
// Every path visible in this listing returns true: when no constant can be
// folded, VAddr is the unmodified Addr and Offset is 0. If the constant is
// foldable but too large for the offset field, the legal part goes into
// Offset and the remainder is added to the base with explicit VALU adds.
// NOTE(review): listing lines 1961, 1969 and 2010 are absent from this
// excerpt (the tail of the CanHaveFlatSegmentOffsetBug condition and the
// declaration of Opnds among them).
1958bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(
1959 SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset,
1960 AMDGPU::FlatAddrSpace FlatVariant) const {
1962 int64_t OffsetVal = 0;
1963
1964 unsigned AS = findMemSDNode(N)->getAddressSpace();
1965
1966 bool CanHaveFlatSegmentOffsetBug =
1967 Subtarget->hasFlatSegmentOffsetBug() &&
1968 FlatVariant == FlatAddrSpace::FLAT &&
1970
// Offset folding is only attempted when the subtarget has instruction
// offsets and the access cannot hit the flat segment offset bug.
1971 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1972 SDValue N0, N1;
1973 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1974 (FlatVariant != FlatAddrSpace::FlatScratch ||
1975 isFlatScratchBaseLegal(Addr))) {
1976 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1977
1978 // Adding the offset to the base address in a FLAT instruction must not
1979 // change the memory aperture in which the address falls. Therefore we can
1980 // only fold offsets from inbounds GEPs into FLAT instructions.
1981 bool IsInBounds =
1982 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1983 if (COffsetVal == 0 || FlatVariant != FlatAddrSpace::FLAT || IsInBounds) {
1984 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1985 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
// The whole constant fits in the instruction's offset field.
1986 Addr = N0;
1987 OffsetVal = COffsetVal;
1988 } else {
1989 // If the offset doesn't fit, put the low bits into the offset field
1990 // and add the rest.
1991 //
1992 // For a FLAT instruction the hardware decides whether to access
1993 // global/scratch/shared memory based on the high bits of vaddr,
1994 // ignoring the offset field, so we have to ensure that when we add
1995 // remainder to vaddr it still points into the same underlying object.
1996 // The easiest way to do that is to make sure that we split the offset
1997 // into two pieces that are both >= 0 or both <= 0.
1998
1999 SDLoc DL(N);
2000 uint64_t RemainderOffset;
2001
2002 std::tie(OffsetVal, RemainderOffset) =
2003 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
2004
2005 SDValue AddOffsetLo =
2006 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
2007 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2008
// 32-bit base: a single VALU add of the low half of the remainder suffices.
2009 if (Addr.getValueType().getSizeInBits() == 32) {
2011 Opnds.push_back(N0);
2012 Opnds.push_back(AddOffsetLo);
2013 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
2014 if (Subtarget->hasAddNoCarryInsts()) {
2015 AddOp = AMDGPU::V_ADD_U32_e64;
2016 Opnds.push_back(Clamp);
2017 }
2018 Addr =
2019 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
2020 } else {
2021 // TODO: Should this try to use a scalar add pseudo if the base
2022 // address is uniform and saddr is usable?
// Wider base: split N0 into sub0/sub1 halves, add with carry, and
// reassemble the 64-bit result with REG_SEQUENCE.
2023 SDValue Sub0 =
2024 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
2025 SDValue Sub1 =
2026 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
2027
2028 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
2029 DL, MVT::i32, N0, Sub0);
2030 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
2031 DL, MVT::i32, N0, Sub1);
2032
2033 SDValue AddOffsetHi =
2034 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
2035
2036 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
2037
2038 SDNode *Add =
2039 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
2040 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
2041
// Result 1 of the low add is its carry-out, consumed by the high add.
2042 SDNode *Addc = CurDAG->getMachineNode(
2043 AMDGPU::V_ADDC_U32_e64, DL, VTs,
2044 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
2045
2046 SDValue RegSequenceArgs[] = {
2047 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
2048 MVT::i32),
2049 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
2050
2051 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
2052 MVT::i64, RegSequenceArgs),
2053 0);
2054 }
2055 }
2056 }
2057 }
2058 }
2059
// Publish the (possibly rewritten) base and the signed immediate.
2060 VAddr = Addr;
2061 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2062 return true;
2063}
2064
// Thin wrapper that forwards to SelectFlatOffsetImpl and returns its result.
// NOTE(review): listing line 2069, which carries the final (FlatVariant)
// argument, is absent from this excerpt - confirm against the full file.
2065bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
2066 SDValue &VAddr,
2067 SDValue &Offset) const {
2068 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
2070}
2071
// Thin wrapper that forwards to SelectFlatOffsetImpl and returns its result.
// NOTE(review): listing line 2076, which carries the final (FlatVariant)
// argument, is absent from this excerpt - confirm against the full file.
2072bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
2073 SDValue &VAddr,
2074 SDValue &Offset) const {
2075 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
2077}
2078
// Thin wrapper that forwards to SelectFlatOffsetImpl and returns its result.
// NOTE(review): listing line 2083, which carries the final (FlatVariant)
// argument, is absent from this excerpt - confirm against the full file.
2079bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
2080 SDValue &VAddr,
2081 SDValue &Offset) const {
2082 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
2084}
2085
2086// If Op is already an i32 value, return Op itself.
2087// Otherwise, if Op is an accepted extension of an i32 value x, return x;
// in every other case return an empty SDValue.
// NOTE(review): listing line 2088 (the start of the signature) is absent from
// this excerpt; call sites in this file use the name matchExtFromI32orI32 with
// arguments (Op, IsSigned, DAG).
2089 const SelectionDAG *DAG) {
2090 if (Op.getValueType() == MVT::i32)
2091 return Op;
2092
// Accepted opcodes: the extension matching IsSigned, ANY_EXTEND, or the
// opposite-signedness extension when the sign bit of Op is known to be zero.
2093 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
2094 Op.getOpcode() != ISD::ANY_EXTEND &&
2095 !(DAG->SignBitIsZero(Op) &&
2096 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
2097 return SDValue();
2098
// Only an extension whose source is exactly i32 qualifies.
2099 SDValue ExtSrc = Op.getOperand(0);
2100 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
2101}
2102
2103// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
2104// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success SAddr receives the uniform base, VOffset the 32-bit per-lane
// offset, Offset the immediate, and ScaleOffset reports whether the variable
// offset was matched as pre-scaled by the access size. When NeedIOffset is
// false no constant is folded into the immediate field.
// NOTE(review): listing lines 2109, 2197-2198 and 2202 are absent from this
// excerpt (part of the MAD-matching condition among them).
2105bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2106 SDValue &SAddr, SDValue &VOffset,
2107 SDValue &Offset, bool &ScaleOffset,
2108 bool NeedIOffset) const {
2110 int64_t ImmOffset = 0;
2111 ScaleOffset = false;
2112
2113 // Match the immediate offset first, which canonically is moved as low as
2114 // possible.
2115
2116 SDValue LHS, RHS;
2117 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2118 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2119 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2120
2121 if (NeedIOffset &&
2122 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
2123 FlatAddrSpace::FlatGlobal)) {
// The constant fits the immediate field; keep matching on the base.
2124 Addr = LHS;
2125 ImmOffset = COffsetVal;
2126 } else if (!LHS->isDivergent()) {
2127 if (COffsetVal > 0) {
2128 SDLoc SL(N);
2129 // saddr + large_offset -> saddr +
2130 // (voffset = large_offset & ~MaxOffset) +
2131 // (large_offset & MaxOffset);
2132 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2133 if (NeedIOffset) {
2134 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2135 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, FlatAddrSpace::FlatGlobal);
2136 }
2137
// The remainder must fit the 32-bit voffset, signed or unsigned depending
// on the subtarget.
2138 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2139 : isUInt<32>(RemainderOffset)) {
2140 SDNode *VMov = CurDAG->getMachineNode(
2141 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2142 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2143 VOffset = SDValue(VMov, 0);
2144 SAddr = LHS;
2145 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2146 return true;
2147 }
2148 }
2149
2150 // We are adding a 64 bit SGPR and a constant. If constant bus limit
2151 // is 1 we would need to perform 1 or 2 extra moves for each half of
2152 // the constant and it is better to do a scalar add and then issue a
2153 // single VALU instruction to materialize zero. Otherwise it is less
2154 // instructions to perform VALU adds with immediates or inline literals.
2155 unsigned NumLiterals =
2156 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2157 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2158 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2159 return false;
2160 }
2161 }
2162
2163 // Match the variable offset.
2164 if (Addr->isAnyAdd()) {
2165 LHS = Addr.getOperand(0);
2166
2167 if (!LHS->isDivergent()) {
2168 // add (i64 sgpr), (*_extend (i32 vgpr))
2169 RHS = Addr.getOperand(1);
2170 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2171 if (SDValue ExtRHS = matchExtFromI32orI32(
2172 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2173 SAddr = LHS;
2174 VOffset = ExtRHS;
2175 }
2176 }
2177
// RHS is re-read because SelectScaleOffset may have rewritten the local copy.
2178 RHS = Addr.getOperand(1);
2179 if (!SAddr && !RHS->isDivergent()) {
2180 // add (*_extend (i32 vgpr)), (i64 sgpr)
2181 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2182 if (SDValue ExtLHS = matchExtFromI32orI32(
2183 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2184 SAddr = RHS;
2185 VOffset = ExtLHS;
2186 }
2187 }
2188
2189 if (SAddr) {
2190 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2191 return true;
2192 }
2193 }
2194
2195 if (Subtarget->hasScaleOffset() &&
2196 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2199 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2200 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2201 Addr.getOperand(0)->isDivergent() &&
2203 !Addr.getOperand(2)->isDivergent()) {
2204 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2205 unsigned Size =
2206 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2207 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2208 if (ScaleOffset) {
2209 SAddr = Addr.getOperand(2);
2210 VOffset = Addr.getOperand(0);
// NOTE(review): the other exits emit ImmOffset via getSignedTargetConstant;
// this one uses getTargetConstant - confirm ImmOffset cannot be negative here.
2211 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2212 return true;
2213 }
2214 }
2215
// A divergent, undef or constant address cannot serve as the scalar base.
2216 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2217 isa<ConstantSDNode>(Addr))
2218 return false;
2219
2220 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2221 // moves required to copy a 64-bit SGPR to VGPR.
2222 SAddr = Addr;
2223 SDNode *VMov =
2224 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2225 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2226 VOffset = SDValue(VMov, 0);
2227 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2228 return true;
2229}
2230
2231bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2232 SDValue &SAddr, SDValue &VOffset,
2233 SDValue &Offset,
2234 SDValue &CPol) const {
2235 bool ScaleOffset;
2236 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2237 return false;
2238
2239 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2240 SDLoc(), MVT::i32);
2241 return true;
2242}
2243
2244bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2245 SDValue &SAddr, SDValue &VOffset,
2246 SDValue &Offset,
2247 SDValue &CPol) const {
2248 bool ScaleOffset;
2249 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2250 return false;
2251
2252 // We are assuming CPol is always the last operand of the intrinsic.
2253 auto PassedCPol =
2254 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2255 CPol = CurDAG->getTargetConstant(
2256 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2257 return true;
2258}
2259
2260bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2261 SDValue &SAddr,
2262 SDValue &VOffset,
2263 SDValue &Offset,
2264 SDValue &CPol) const {
2265 bool ScaleOffset;
2266 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2267 return false;
2268
2269 // We are assuming CPol is second from last operand of the intrinsic.
2270 auto PassedCPol =
2271 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2272 CPol = CurDAG->getTargetConstant(
2273 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2274 return true;
2275}
2276
2277bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2278 SDValue &SAddr, SDValue &VOffset,
2279 SDValue &Offset,
2280 SDValue &CPol) const {
2281 bool ScaleOffset;
2282 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2283 return false;
2284
2285 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2286 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2287 return true;
2288}
2289
2290bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2291 SDValue &SAddr,
2292 SDValue &VOffset,
2293 SDValue &CPol) const {
2294 bool ScaleOffset;
2295 SDValue DummyOffset;
2296 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2297 false))
2298 return false;
2299
2300 // We are assuming CPol is always the last operand of the intrinsic.
2301 auto PassedCPol =
2302 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2303 CPol = CurDAG->getTargetConstant(
2304 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2305 return true;
2306}
2307
2308bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2309 SDValue &SAddr,
2310 SDValue &VOffset,
2311 SDValue &CPol) const {
2312 bool ScaleOffset;
2313 SDValue DummyOffset;
2314 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2315 false))
2316 return false;
2317
2318 // We are assuming CPol is second from last operand of the intrinsic.
2319 auto PassedCPol =
2320 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2321 CPol = CurDAG->getTargetConstant(
2322 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2323 return true;
2324}
2325
// Rewrite a scalar address that is based on a frame index into its target
// form; any other address is returned unchanged.
// NOTE(review): listing lines 2326 (the signature) and 2330 (the rest of the
// else-if condition) are absent from this excerpt; call sites in this file use
// SelectSAddrFI(CurDAG, SAddr).
// A bare frame index becomes a target frame index of the same value type.
2327 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2328 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2329 } else if (SAddr.getOpcode() == ISD::ADD &&
2331 // Materialize this into a scalar move for scalar address to avoid
2332 // readfirstlane.
// Operand 0 is cast<> to a frame index, so the condition above is presumably
// what guarantees that; the add itself is re-emitted as S_ADD_I32.
2333 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2334 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2335 FI->getValueType(0));
2336 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2337 MVT::i32, TFI, SAddr.getOperand(1)),
2338 0);
2339 }
2340
2341 return SAddr;
2342}
2343
2344// Match (32-bit SGPR base) + sext(imm offset)
//
// Fails only for a divergent address. Otherwise SAddr receives the scalar
// base (with frame indices rewritten by SelectSAddrFI) and Offset the signed
// immediate. A constant that is not a legal scratch offset is split: the
// legal part stays in Offset and the remainder is added to the base with
// S_ADD_I32.
// NOTE(review): listing lines 2348 and 2376 are absent from this excerpt
// (the latter is the condition of the AddOffset conditional expression).
2345bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2346 SDValue &SAddr,
2347 SDValue &Offset) const {
2349 if (Addr->isDivergent())
2350 return false;
2351
2352 SDLoc DL(Addr);
2353
2354 int64_t COffsetVal = 0;
2355
// Peel a constant offset only when the resulting base is legal for scratch.
2356 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2357 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2358 SAddr = Addr.getOperand(0);
2359 } else {
2360 SAddr = Addr;
2361 }
2362
2363 SAddr = SelectSAddrFI(CurDAG, SAddr);
2364
2365 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2366
2367 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2368 FlatAddrSpace::FlatScratch)) {
2369 int64_t SplitImmOffset, RemainderOffset;
2370 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2371 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, FlatAddrSpace::FlatScratch);
2372
2373 COffsetVal = SplitImmOffset;
2374
// The remainder is either materialized in a register or used as an
// immediate operand of the scalar add.
2375 SDValue AddOffset =
2377 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2378 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2379 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2380 SAddr, AddOffset),
2381 0);
2382 }
2383
2384 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2385
2386 return true;
2387}
2388
2389// Check whether the flat scratch SVS swizzle bug affects this access.
2390bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2391 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2392 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2393 return false;
2394
2395 // The bug affects the swizzling of SVS accesses if there is any carry out
2396 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2397 // voffset to (soffset + inst_offset).
2398 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2399 KnownBits SKnown =
2400 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2401 KnownBits::makeConstant(APInt(32, ImmOffset,
2402 /*isSigned=*/true)));
2403 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2404 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2405 return (VMax & 3) + (SMax & 3) >= 4;
2406}
2407
// Match a scratch address of the form (scalar base) + (vector offset) with an
// optional constant, producing SAddr, VAddr, the immediate Offset and a CPol
// value carrying the SCAL bit when the vector offset was matched as scaled.
// Returns false when the address is not an ADD of one uniform and one
// divergent operand, when the base is not legal for scratch, or when the
// SVS swizzle bug check fires.
// NOTE(review): listing lines 2421 and 2431 are absent from this excerpt
// (the trailing arguments of isLegalFLATOffset and splitFlatOffset).
2408bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2409 SDValue &VAddr, SDValue &SAddr,
2410 SDValue &Offset,
2411 SDValue &CPol) const {
2412 int64_t ImmOffset = 0;
2413
2414 SDValue LHS, RHS;
// Remember the unmodified address so the legality checks below can tell
// whether an immediate was peeled off.
2415 SDValue OrigAddr = Addr;
2416 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2417 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2418 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2419
2420 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2422 Addr = LHS;
2423 ImmOffset = COffsetVal;
2424 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2425 SDLoc SL(N);
2426 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2427 // (large_offset & MaxOffset);
2428 int64_t SplitImmOffset, RemainderOffset;
2429 std::tie(SplitImmOffset, RemainderOffset) =
2430 TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2432
// The remainder is moved into a VGPR, so it must fit in 32 unsigned bits.
2433 if (isUInt<32>(RemainderOffset)) {
2434 SDNode *VMov = CurDAG->getMachineNode(
2435 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2436 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2437 VAddr = SDValue(VMov, 0);
2438 SAddr = LHS;
2439 if (!isFlatScratchBaseLegal(Addr))
2440 return false;
2441 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2442 return false;
2443 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2444 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2445 return true;
2446 }
2447 }
2448 }
2449
2450 if (Addr.getOpcode() != ISD::ADD)
2451 return false;
2452
2453 LHS = Addr.getOperand(0);
2454 RHS = Addr.getOperand(1);
2455
// Exactly one operand must be uniform (SAddr) and the other divergent (VAddr).
2456 if (!LHS->isDivergent() && RHS->isDivergent()) {
2457 SAddr = LHS;
2458 VAddr = RHS;
2459 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2460 SAddr = RHS;
2461 VAddr = LHS;
2462 } else {
2463 return false;
2464 }
2465
// A different legality predicate applies when an immediate was folded above.
2466 if (OrigAddr != Addr) {
2467 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2468 return false;
2469 } else {
2470 if (!isFlatScratchBaseLegalSV(OrigAddr))
2471 return false;
2472 }
2473
2474 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2475 return false;
2476 SAddr = SelectSAddrFI(CurDAG, SAddr);
2477 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2478
// SelectScaleOffset may rewrite VAddr to the unscaled index when it succeeds.
2479 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2480 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2481 SDLoc(), MVT::i32);
2482 return true;
2483}
2484
2485// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2486// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2487// Handle the case where the Immediate Offset + SOffset is negative.
2488bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2489 bool Imm32Only,
2490 bool IsBuffer,
2491 int64_t ImmOffset) const {
2492 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2493 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2494 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2495 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2496 return false;
2497 }
2498
2499 return true;
2500}
2501
2502// Given \p Offset and load node \p N check if an \p Offset is a multiple of
2503// the load byte size. If it is update \p Offset to a pre-scaled value and
2504// return true.
2505bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2506 bool IsSigned) const {
2507 bool ScaleOffset = false;
2508 if (!Subtarget->hasScaleOffset() || !Offset)
2509 return false;
2510
2511 unsigned Size =
2512 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2513
2514 SDValue Off = Offset;
2515 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2516 Off = Ext;
2517
2518 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2519 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2520 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2521 } else if (Offset.getOpcode() == ISD::MUL ||
2522 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2523 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2524 (Offset.isMachineOpcode() &&
2525 Offset.getMachineOpcode() ==
2526 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2527 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2528 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2529 ScaleOffset = C->getZExtValue() == Size;
2530 }
2531
2532 if (ScaleOffset)
2533 Offset = Off.getOperand(0);
2534
2535 return ScaleOffset;
2536}
2537
2538// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2539// not null) offset. If Imm32Only is true, match only 32-bit immediate
2540// offsets available on CI.
//
// Exactly one of SOffset / Offset may be non-null (asserted below). When
// ScaleOffset is non-null it receives the result of SelectScaleOffset, which
// may also rewrite the local ByteOffsetNode to the unscaled index. ImmOffset
// is only used for the legality check of a register offset.
2541bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2542 SDValue *SOffset, SDValue *Offset,
2543 bool Imm32Only, bool IsBuffer,
2544 bool HasSOffset, int64_t ImmOffset,
2545 bool *ScaleOffset) const {
2546 assert((!SOffset || !Offset) &&
2547 "Cannot match both soffset and offset at the same time!");
2548
2549 if (ScaleOffset) {
2550 assert(N && SOffset);
2551
2552 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2553 }
2554
// Non-constant offset: it can only be used as an SGPR offset.
2555 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2556 if (!C) {
2557 if (!SOffset)
2558 return false;
2559
// A 32-bit scalar integer is used directly.
2560 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2561 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2562 *SOffset = ByteOffsetNode;
2563 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2564 ImmOffset);
2565 }
// A zero-extended 32-bit value is used through its extension source.
2566 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2567 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2568 *SOffset = ByteOffsetNode.getOperand(0);
2569 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2570 ImmOffset);
2571 }
2572 }
2573 return false;
2574 }
2575
2576 SDLoc SL(ByteOffsetNode);
2577
2578 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2579 // offset for S_BUFFER instructions is unsigned.
2580 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2581 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2582 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
// First choice: the regular encoded immediate (not in Imm32Only mode).
2583 if (EncodedOffset && Offset && !Imm32Only) {
2584 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2585 return true;
2586 }
2587
2588 // SGPR and literal offsets are unsigned.
2589 if (ByteOffset < 0)
2590 return false;
2591
// Second choice: the 32-bit literal encoding, only in Imm32Only mode.
2592 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2593 if (EncodedOffset && Offset && Imm32Only) {
2594 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2595 return true;
2596 }
2597
2598 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2599 return false;
2600
// Last resort: materialize the constant into an SGPR with S_MOV_B32.
2601 if (SOffset) {
2602 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2603 *SOffset = SDValue(
2604 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2605 return true;
2606 }
2607
2608 return false;
2609}
2610
2611SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2612 if (Addr.getValueType() != MVT::i32)
2613 return Addr;
2614
2615 // Zero-extend a 32-bit address.
2616 SDLoc SL(Addr);
2617
2618 const MachineFunction &MF = CurDAG->getMachineFunction();
2619 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2620 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2621 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2622
2623 const SDValue Ops[] = {
2624 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2625 Addr,
2626 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2627 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2628 0),
2629 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2630 };
2631
2632 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2633 Ops), 0);
2634}
2635
2636// Match a base and an immediate (if Offset is not null) or an SGPR (if
2637// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2638// true, match only 32-bit immediate offsets available on CI.
//
// On success SBase receives the operand of the add-like address that was not
// consumed as the offset. The remaining parameters are forwarded to
// SelectSMRDOffset.
2639bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2640 SDValue &SBase, SDValue *SOffset,
2641 SDValue *Offset, bool Imm32Only,
2642 bool IsBuffer, bool HasSOffset,
2643 int64_t ImmOffset,
2644 bool *ScaleOffset) const {
// Immediate + SGPR form: peel the immediate first, then match the SGPR offset
// on the remaining base, passing the immediate along for the legality check.
2645 if (SOffset && Offset) {
2646 assert(!Imm32Only && !IsBuffer);
2647 SDValue B;
2648
2649 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2650 return false;
2651
2652 int64_t ImmOff = 0;
2653 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2654 ImmOff = C->getSExtValue();
2655
2656 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2657 true, ImmOff, ScaleOffset);
2658 }
2659
2660 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2661 // wraparound, because s_load instructions perform the addition in 64 bits.
2662 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2663 !Addr->getFlags().hasNoUnsignedWrap())
2664 return false;
2665
2666 SDValue N0, N1;
2667 // Extract the base and offset if possible.
2668 if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
2669 N0 = Addr.getOperand(0);
2670 N1 = Addr.getOperand(1);
2671 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2672 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2673 }
2674 if (!N0 || !N1)
2675 return false;
2676
// Try the second operand as the offset first, then the first operand.
2677 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2678 ImmOffset, ScaleOffset)) {
2679 SBase = N0;
2680 return true;
2681 }
2682 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2683 ImmOffset, ScaleOffset)) {
2684 SBase = N1;
2685 return true;
2686 }
2687 return false;
2688}
2689
2690bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2691 SDValue *SOffset, SDValue *Offset,
2692 bool Imm32Only, bool *ScaleOffset) const {
2693 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2694 /* IsBuffer */ false, /* HasSOffset */ false,
2695 /* ImmOffset */ 0, ScaleOffset)) {
2696 SBase = Expand32BitAddress(SBase);
2697 return true;
2698 }
2699
2700 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2701 SBase = Expand32BitAddress(Addr);
2702 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2703 return true;
2704 }
2705
2706 return false;
2707}
2708
2709bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2710 SDValue &Offset) const {
2711 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2712 &Offset);
2713}
2714
2715bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2716 SDValue &Offset) const {
2717 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2718 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2719 &Offset, /* Imm32Only */ true);
2720}
2721
2722bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2723 SDValue &SOffset, SDValue &CPol) const {
2724 bool ScaleOffset;
2725 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2726 /* Imm32Only */ false, &ScaleOffset))
2727 return false;
2728
2729 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2730 SDLoc(N), MVT::i32);
2731 return true;
2732}
2733
2734bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2735 SDValue &SBase, SDValue &SOffset,
2736 SDValue &Offset,
2737 SDValue &CPol) const {
2738 bool ScaleOffset;
2739 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2740 return false;
2741
2742 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2743 SDLoc(N), MVT::i32);
2744 return true;
2745}
2746
2747bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2748 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2749 /* Imm32Only */ false, /* IsBuffer */ true);
2750}
2751
2752bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2753 SDValue &Offset) const {
2754 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2755 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2756 /* Imm32Only */ true, /* IsBuffer */ true);
2757}
2758
2759bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2760 SDValue &Offset) const {
2761 // Match the (soffset + offset) pair as a 32-bit register base and
2762 // an immediate offset.
2763 return N.getValueType() == MVT::i32 &&
2764 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2765 /* SOffset*/ nullptr, &Offset,
2766 /* Imm32Only */ false, /* IsBuffer */ true);
2767}
2768
2769bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2770 SDValue &Base,
2771 SDValue &Offset) const {
2772 SDLoc DL(Index);
2773
2774 if (CurDAG->isBaseWithConstantOffset(Index)) {
2775 SDValue N0 = Index.getOperand(0);
2776 SDValue N1 = Index.getOperand(1);
2777 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2778
2779 // (add n0, c0)
2780 // Don't peel off the offset (c0) if doing so could possibly lead
2781 // the base (n0) to be negative.
2782 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2783 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2784 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2785 Base = N0;
2786 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2787 return true;
2788 }
2789 }
2790
2791 if (isa<ConstantSDNode>(Index))
2792 return false;
2793
2794 Base = Index;
2795 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2796 return true;
2797}
2798
2799SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2800 SDValue Val, uint32_t Offset,
2801 uint32_t Width) {
2802 if (Val->isDivergent()) {
2803 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2804 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2805 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2806
2807 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2808 }
2809 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2810 // Transformation function, pack the offset and width of a BFE into
2811 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2812 // source, bits [5:0] contain the offset and bits [22:16] the width.
2813 uint32_t PackedVal = Offset | (Width << 16);
2814 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2815
2816 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2817}
2818
2819void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2820 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2821 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2822 // Predicate: 0 < b <= c < 32
2823
2824 const SDValue &Shl = N->getOperand(0);
2825 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2826 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2827
2828 if (B && C) {
2829 uint32_t BVal = B->getZExtValue();
2830 uint32_t CVal = C->getZExtValue();
2831
2832 if (0 < BVal && BVal <= CVal && CVal < 32) {
2833 bool Signed = N->getOpcode() == ISD::SRA;
2834 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2835 32 - CVal));
2836 return;
2837 }
2838 }
2839 SelectCode(N);
2840}
2841
2842void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2843 switch (N->getOpcode()) {
2844 case ISD::AND:
2845 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2846 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2847 // Predicate: isMask(mask)
2848 const SDValue &Srl = N->getOperand(0);
2849 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2850 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2851
2852 if (Shift && Mask) {
2853 uint32_t ShiftVal = Shift->getZExtValue();
2854 uint32_t MaskVal = Mask->getZExtValue();
2855
2856 if (isMask_32(MaskVal)) {
2857 uint32_t WidthVal = llvm::popcount(MaskVal);
2858 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2859 WidthVal));
2860 return;
2861 }
2862 }
2863 }
2864 break;
2865 case ISD::SRL:
2866 if (N->getOperand(0).getOpcode() == ISD::AND) {
2867 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2868 // Predicate: isMask(mask >> b)
2869 const SDValue &And = N->getOperand(0);
2870 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2871 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2872
2873 if (Shift && Mask) {
2874 uint32_t ShiftVal = Shift->getZExtValue();
2875 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2876
2877 if (isMask_32(MaskVal)) {
2878 uint32_t WidthVal = llvm::popcount(MaskVal);
2879 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2880 WidthVal));
2881 return;
2882 }
2883 }
2884 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2885 SelectS_BFEFromShifts(N);
2886 return;
2887 }
2888 break;
2889 case ISD::SRA:
2890 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2891 SelectS_BFEFromShifts(N);
2892 return;
2893 }
2894 break;
2895
2897 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2898 SDValue Src = N->getOperand(0);
2899 if (Src.getOpcode() != ISD::SRL)
2900 break;
2901
2902 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2903 if (!Amt)
2904 break;
2905
2906 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2907 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2908 Amt->getZExtValue(), Width));
2909 return;
2910 }
2911 }
2912
2913 SelectCode(N);
2914}
2915
2916bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2917 assert(N->getOpcode() == ISD::BRCOND);
2918 if (!N->hasOneUse())
2919 return false;
2920
2921 SDValue Cond = N->getOperand(1);
2922 if (Cond.getOpcode() == ISD::CopyToReg)
2923 Cond = Cond.getOperand(2);
2924
2925 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2926 return false;
2927
2928 MVT VT = Cond.getOperand(0).getSimpleValueType();
2929 if (VT == MVT::i32)
2930 return true;
2931
2932 if (VT == MVT::i64) {
2933 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2934 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2935 Subtarget->hasScalarCompareEq64();
2936 }
2937
2938 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2939 return true;
2940
2941 return false;
2942}
2943
2944static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2945 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2946 // Special case for amdgcn.ballot:
2947 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2948 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2949 // =>
2950 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2951 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2952 // Cond becomes a i(WaveSize) full mask value.
2953 // Note that ballot doesn't use SETEQ condition but its easy to support it
2954 // here for completeness, so in this case Negate is set true on return.
2955 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2956 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2957 isNullConstant(VCMP.getOperand(1))) {
2958
2959 auto Cond = VCMP.getOperand(0);
2960 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2961 Cond = Cond.getOperand(0);
2962
2963 if (isBoolSGPR(Cond)) {
2964 Negate = VCMP_CC == ISD::SETEQ;
2965 return Cond;
2966 }
2967 }
2968 return SDValue();
2969}
2970
2971void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2972 SDValue Cond = N->getOperand(1);
2973
2974 if (Cond.isUndef()) {
2975 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2976 N->getOperand(2), N->getOperand(0));
2977 return;
2978 }
2979
2980 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2981
2982 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2983 bool AndExec = !UseSCCBr;
2984 bool Negate = false;
2985
2986 if (Cond.getOpcode() == ISD::SETCC &&
2987 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2988 SDValue VCMP = Cond->getOperand(0);
2989 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2990 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2991 isNullConstant(Cond->getOperand(1)) &&
2992 // We may encounter ballot.i64 in wave32 mode on -O0.
2993 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2994 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2995 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2996 // BRCOND i1 %C, %BB
2997 // =>
2998 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2999 // VCC = COPY i(WaveSize) %VCMP
3000 // S_CBRANCH_VCCNZ/VCCZ %BB
3001 Negate = CC == ISD::SETEQ;
3002 bool NegatedBallot = false;
3003 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
3004 Cond = BallotCond;
3005 UseSCCBr = !BallotCond->isDivergent();
3006 Negate = Negate ^ NegatedBallot;
3007 } else {
3008 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
3009 // selected as V_CMP, but this may change for uniform condition.
3010 Cond = VCMP;
3011 UseSCCBr = false;
3012 }
3013 }
3014 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
3015 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
3016 // used.
3017 AndExec = false;
3018 }
3019
3020 unsigned BrOp =
3021 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
3022 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
3023 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
3024 SDLoc SL(N);
3025
3026 if (AndExec) {
3027 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
3028 // analyzed what generates the vcc value, so we do not know whether vcc
3029 // bits for disabled lanes are 0. Thus we need to mask out bits for
3030 // disabled lanes.
3031 //
3032 // For the case that we select S_CBRANCH_SCC1 and it gets
3033 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
3034 // SIInstrInfo::moveToVALU which inserts the S_AND).
3035 //
3036 // We could add an analysis of what generates the vcc value here and omit
3037 // the S_AND when is unnecessary. But it would be better to add a separate
3038 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
3039 // catches both cases.
3040 Cond = SDValue(
3041 CurDAG->getMachineNode(
3042 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
3043 MVT::i1,
3044 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
3045 : AMDGPU::EXEC,
3046 MVT::i1),
3047 Cond),
3048 0);
3049 }
3050
3051 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
3052 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
3053 N->getOperand(2), // Basic Block
3054 VCC.getValue(0));
3055}
3056
3057void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
3058 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
3059 !N->isDivergent()) {
3060 SDValue Src = N->getOperand(0);
3061 if (Src.getValueType() == MVT::f16) {
3062 if (isExtractHiElt(Src, Src)) {
3063 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
3064 {Src});
3065 return;
3066 }
3067 }
3068 }
3069
3070 SelectCode(N);
3071}
3072
3073void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
3074 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
3075 // be copied to an SGPR with readfirstlane.
3076 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
3077 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
3078
3079 SDValue Chain = N->getOperand(0);
3080 SDValue Ptr = N->getOperand(2);
3081 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3082 MachineMemOperand *MMO = M->getMemOperand();
3083 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
3084
3086 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
3087 SDValue PtrBase = Ptr.getOperand(0);
3088 SDValue PtrOffset = Ptr.getOperand(1);
3089
3090 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
3091 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
3092 N = glueCopyToM0(N, PtrBase);
3093 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
3094 }
3095 }
3096
3097 if (!Offset) {
3098 N = glueCopyToM0(N, Ptr);
3099 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
3100 }
3101
3102 SDValue Ops[] = {
3103 Offset,
3104 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
3105 Chain,
3106 N->getOperand(N->getNumOperands() - 1) // New glue
3107 };
3108
3109 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3110 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3111}
3112
3113// We need to handle this here because tablegen doesn't support matching
3114// instructions with multiple outputs.
3115void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3116 unsigned Opc;
3117 switch (IntrID) {
3118 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3119 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3120 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3121 break;
3122 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3123 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3124 break;
3125 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3126 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3127 break;
3128 }
3129 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3130 N->getOperand(5), N->getOperand(0)};
3131
3132 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3133 MachineMemOperand *MMO = M->getMemOperand();
3134 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3135 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3136}
3137
3138void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3139 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3140 unsigned Opc =
3141 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3142
3143 SmallVector<SDValue, 7> TensorOps;
3144 // First two groups
3145 TensorOps.push_back(N->getOperand(2)); // D# group 0
3146 TensorOps.push_back(N->getOperand(3)); // D# group 1
3147
3148 // Use _D2 version if both group 2 and 3 are zero-initialized.
3149 SDValue Group2 = N->getOperand(4);
3150 SDValue Group3 = N->getOperand(5);
3151 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3153 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3154 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3155 } else { // Has at least 4 groups
3156 TensorOps.push_back(Group2); // D# group 2
3157 TensorOps.push_back(Group3); // D# group 3
3158 }
3159
3160 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3161 // for now because all existing targets only support up to 4 groups.
3162 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3163 TensorOps.push_back(N->getOperand(7)); // cache policy
3164 TensorOps.push_back(N->getOperand(0)); // chain
3165
3166 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3167}
3168
3169static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3170 switch (IntrID) {
3171 case Intrinsic::amdgcn_ds_gws_init:
3172 return AMDGPU::DS_GWS_INIT;
3173 case Intrinsic::amdgcn_ds_gws_barrier:
3174 return AMDGPU::DS_GWS_BARRIER;
3175 case Intrinsic::amdgcn_ds_gws_sema_v:
3176 return AMDGPU::DS_GWS_SEMA_V;
3177 case Intrinsic::amdgcn_ds_gws_sema_br:
3178 return AMDGPU::DS_GWS_SEMA_BR;
3179 case Intrinsic::amdgcn_ds_gws_sema_p:
3180 return AMDGPU::DS_GWS_SEMA_P;
3181 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3182 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3183 default:
3184 llvm_unreachable("not a gws intrinsic");
3185 }
3186}
3187
3188void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3189 if (!Subtarget->hasGWS() ||
3190 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3191 !Subtarget->hasGWSSemaReleaseAll())) {
3192 // Let this error.
3193 SelectCode(N);
3194 return;
3195 }
3196
3197 // Chain, intrinsic ID, vsrc, offset
3198 const bool HasVSrc = N->getNumOperands() == 4;
3199 assert(HasVSrc || N->getNumOperands() == 3);
3200
3201 SDLoc SL(N);
3202 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3203 int ImmOffset = 0;
3204 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3205 MachineMemOperand *MMO = M->getMemOperand();
3206
3207 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3208 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3209
3210 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3211 // offset field) % 64. Some versions of the programming guide omit the m0
3212 // part, or claim it's from offset 0.
3213 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3214 // If we have a constant offset, try to use the 0 in m0 as the base.
3215 // TODO: Look into changing the default m0 initialization value. If the
3216 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3217 // the immediate offset.
3218 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3219 ImmOffset = ConstOffset->getZExtValue();
3220 } else {
3221 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3222 ImmOffset = BaseOffset.getConstantOperandVal(1);
3223 BaseOffset = BaseOffset.getOperand(0);
3224 }
3225
3226 // Prefer to do the shift in an SGPR since it should be possible to use m0
3227 // as the result directly. If it's already an SGPR, it will be eliminated
3228 // later.
3229 SDNode *SGPROffset
3230 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3231 BaseOffset);
3232 // Shift to offset in m0
3233 SDNode *M0Base
3234 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3235 SDValue(SGPROffset, 0),
3236 CurDAG->getTargetConstant(16, SL, MVT::i32));
3237 glueCopyToM0(N, SDValue(M0Base, 0));
3238 }
3239
3240 SDValue Chain = N->getOperand(0);
3241 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3242
3243 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3244
3245 const MCInstrDesc &InstrDesc = TII->get(Opc);
3246 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3247
3248 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3249
3251 if (HasVSrc) {
3252 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3253
3254 SDValue Data = N->getOperand(2);
3255 MVT DataVT = Data.getValueType().getSimpleVT();
3256 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3257 // Normal 32-bit case.
3258 Ops.push_back(N->getOperand(2));
3259 } else {
3260 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3261 // even aligned 64-bit register class.
3262 const SDValue RegSeqOps[] = {
3263 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3264 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3265 SDValue(
3266 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3267 0),
3268 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3269
3270 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3271 SL, MVT::v2i32, RegSeqOps),
3272 0));
3273 }
3274 }
3275
3276 Ops.push_back(OffsetField);
3277 Ops.push_back(Chain);
3278
3279 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3280 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3281}
3282
3283void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3284 if (Subtarget->getLDSBankCount() != 16) {
3285 // This is a single instruction with a pattern.
3286 SelectCode(N);
3287 return;
3288 }
3289
3290 SDLoc DL(N);
3291
3292 // This requires 2 instructions. It is possible to write a pattern to support
3293 // this, but the generated isel emitter doesn't correctly deal with multiple
3294 // output instructions using the same physical register input. The copy to m0
3295 // is incorrectly placed before the second instruction.
3296 //
3297 // TODO: Match source modifiers.
3298 //
3299 // def : Pat <
3300 // (int_amdgcn_interp_p1_f16
3301 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3302 // (i32 timm:$attrchan), (i32 timm:$attr),
3303 // (i1 timm:$high), M0),
3304 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3305 // timm:$attrchan, 0,
3306 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3307 // let Predicates = [has16BankLDS];
3308 // }
3309
3310 // 16 bank LDS
3311 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3312 N->getOperand(5), SDValue());
3313
3314 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3315
3316 SDNode *InterpMov =
3317 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3318 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3319 N->getOperand(3), // Attr
3320 N->getOperand(2), // Attrchan
3321 ToM0.getValue(1) // In glue
3322 });
3323
3324 SDNode *InterpP1LV =
3325 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3326 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3327 N->getOperand(1), // Src0
3328 N->getOperand(3), // Attr
3329 N->getOperand(2), // Attrchan
3330 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3331 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3332 N->getOperand(4), // high
3333 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3334 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3335 SDValue(InterpMov, 1)
3336 });
3337
3338 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3339}
3340
3341void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3342 unsigned IntrID = N->getConstantOperandVal(1);
3343 switch (IntrID) {
3344 case Intrinsic::amdgcn_ds_append:
3345 case Intrinsic::amdgcn_ds_consume: {
3346 if (N->getValueType(0) != MVT::i32)
3347 break;
3348 SelectDSAppendConsume(N, IntrID);
3349 return;
3350 }
3351 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3352 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3353 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3354 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3355 SelectDSBvhStackIntrinsic(N, IntrID);
3356 return;
3357 case Intrinsic::amdgcn_init_whole_wave:
3358 CurDAG->getMachineFunction()
3359 .getInfo<SIMachineFunctionInfo>()
3360 ->setInitWholeWave();
3361 break;
3362 }
3363
3364 SelectCode(N);
3365}
3366
3367void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3368 unsigned IntrID = N->getConstantOperandVal(0);
3369 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3370 SDNode *ConvGlueNode = N->getGluedNode();
3371 if (ConvGlueNode) {
3372 // FIXME: Possibly iterate over multiple glue nodes?
3373 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3374 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3375 ConvGlueNode =
3376 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3377 MVT::Glue, SDValue(ConvGlueNode, 0));
3378 } else {
3379 ConvGlueNode = nullptr;
3380 }
3381 switch (IntrID) {
3382 case Intrinsic::amdgcn_wqm:
3383 Opcode = AMDGPU::WQM;
3384 break;
3385 case Intrinsic::amdgcn_softwqm:
3386 Opcode = AMDGPU::SOFT_WQM;
3387 break;
3388 case Intrinsic::amdgcn_wwm:
3389 case Intrinsic::amdgcn_strict_wwm:
3390 Opcode = AMDGPU::STRICT_WWM;
3391 break;
3392 case Intrinsic::amdgcn_strict_wqm:
3393 Opcode = AMDGPU::STRICT_WQM;
3394 break;
3395 case Intrinsic::amdgcn_interp_p1_f16:
3396 SelectInterpP1F16(N);
3397 return;
3398 case Intrinsic::amdgcn_permlane16_swap:
3399 case Intrinsic::amdgcn_permlane32_swap: {
3400 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3401 !Subtarget->hasPermlane16Swap()) ||
3402 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3403 !Subtarget->hasPermlane32Swap())) {
3404 SelectCode(N); // Hit the default error
3405 return;
3406 }
3407
3408 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3409 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3410 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3411
3412 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3413 if (ConvGlueNode)
3414 NewOps.push_back(SDValue(ConvGlueNode, 0));
3415
3416 bool FI = N->getConstantOperandVal(3);
3417 NewOps[2] = CurDAG->getTargetConstant(
3418 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3419
3420 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3421 return;
3422 }
3423 default:
3424 SelectCode(N);
3425 break;
3426 }
3427
3428 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3429 SDValue Src = N->getOperand(1);
3430 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3431 }
3432
3433 if (ConvGlueNode) {
3434 SmallVector<SDValue, 4> NewOps(N->ops());
3435 NewOps.push_back(SDValue(ConvGlueNode, 0));
3436 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3437 }
3438}
3439
3440void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3441 unsigned IntrID = N->getConstantOperandVal(1);
3442 switch (IntrID) {
3443 case Intrinsic::amdgcn_ds_gws_init:
3444 case Intrinsic::amdgcn_ds_gws_barrier:
3445 case Intrinsic::amdgcn_ds_gws_sema_v:
3446 case Intrinsic::amdgcn_ds_gws_sema_br:
3447 case Intrinsic::amdgcn_ds_gws_sema_p:
3448 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3449 SelectDS_GWS(N, IntrID);
3450 return;
3451 case Intrinsic::amdgcn_tensor_load_to_lds:
3452 case Intrinsic::amdgcn_tensor_store_from_lds:
3453 SelectTensorLoadStore(N, IntrID);
3454 return;
3455 default:
3456 break;
3457 }
3458
3459 SelectCode(N);
3460}
3461
3462void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3463 SDValue Log2WaveSize =
3464 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3465 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3466 {N->getOperand(0), Log2WaveSize});
3467}
3468
3469void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3470 SDValue SrcVal = N->getOperand(1);
3471 if (SrcVal.getValueType() != MVT::i32) {
3472 SelectCode(N); // Emit default error
3473 return;
3474 }
3475
3476 SDValue CopyVal;
3477 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3478 SDLoc SL(N);
3479
3480 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3481 CopyVal = SrcVal.getOperand(0);
3482 } else {
3483 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3484 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3485
3486 if (N->isDivergent()) {
3487 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3488 MVT::i32, SrcVal),
3489 0);
3490 }
3491
3492 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3493 {SrcVal, Log2WaveSize}),
3494 0);
3495 }
3496
3497 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3498 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3499}
3500
3501bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3502 unsigned &Mods,
3503 bool IsCanonicalizing,
3504 bool AllowAbs) const {
3505 Mods = SISrcMods::NONE;
3506 Src = In;
3507
3508 if (Src.getOpcode() == ISD::FNEG) {
3509 Mods |= SISrcMods::NEG;
3510 Src = Src.getOperand(0);
3511 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3512 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3513 // denormal mode, but we're implicitly canonicalizing in a source operand.
3514 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3515 if (LHS && LHS->isZero()) {
3516 Mods |= SISrcMods::NEG;
3517 Src = Src.getOperand(1);
3518 }
3519 }
3520
3521 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3522 Mods |= SISrcMods::ABS;
3523 Src = Src.getOperand(0);
3524 }
3525
3526 if (Mods != SISrcMods::NONE)
3527 return true;
3528
3529 // Convert various sign-bit masks on integers to src mods. Currently disabled
3530 // for 16-bit types as the codegen replaces the operand without adding a
3531 // srcmod. This is intentionally finding the cases where we are performing
3532 // float neg and abs on int types, the goal is not to obtain two's complement
3533 // neg or abs. Limit converison to select operands via the nonCanonalizing
3534 // pattern.
3535 // TODO: Add 16-bit support.
3536 if (IsCanonicalizing)
3537 return true;
3538
3539 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3540 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3541 // through the extract to the bitwise op.
3542 SDValue PeekSrc =
3543 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3544 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3545 // types as the codegen replaces the operand without adding a srcmod.
3546 // This is intentionally finding the cases where we are performing float neg
3547 // and abs on int types, the goal is not to obtain two's complement neg or
3548 // abs.
3549 // TODO: Add 16-bit support.
3550 unsigned Opc = PeekSrc.getOpcode();
3551 EVT VT = Src.getValueType();
3552 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3553 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3554 return true;
3555
3556 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3557 if (!CRHS)
3558 return true;
3559
3560 auto ReplaceSrc = [&]() -> SDValue {
3561 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3562 return Src.getOperand(0);
3563
3564 SDValue LHS = PeekSrc->getOperand(0);
3565 SDValue Index = Src->getOperand(1);
3566 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3567 Src.getValueType(), LHS, Index);
3568 };
3569
3570 // Recognise Srcmods:
3571 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3572 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3573 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3574 // SrcModifiers.
3575 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3576 Mods |= SISrcMods::NEG;
3577 Src = ReplaceSrc();
3578 } else if (Opc == ISD::AND && AllowAbs &&
3579 CRHS->getAPIntValue().isMaxSignedValue()) {
3580 Mods |= SISrcMods::ABS;
3581 Src = ReplaceSrc();
3582 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3584 Src = ReplaceSrc();
3585 }
3586
3587 return true;
3588}
3589
3590bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3591 SDValue &SrcMods) const {
3592 unsigned Mods;
3593 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3594 /*AllowAbs=*/true)) {
3595 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3596 return true;
3597 }
3598
3599 return false;
3600}
3601
3602bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3603 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3604 unsigned Mods;
3605 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3606 /*AllowAbs=*/true)) {
3607 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3608 return true;
3609 }
3610
3611 return false;
3612}
3613
3614bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3615 SDValue &SrcMods) const {
3616 unsigned Mods;
3617 if (SelectVOP3ModsImpl(In, Src, Mods,
3618 /*IsCanonicalizing=*/true,
3619 /*AllowAbs=*/false)) {
3620 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3621 return true;
3622 }
3623
3624 return false;
3625}
3626
3627bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3628 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3629 return false;
3630
3631 Src = In;
3632 return true;
3633}
3634
3635bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3636 SDValue &SrcMods,
3637 bool OpSel) const {
3638 unsigned Mods;
3639 if (SelectVOP3ModsImpl(In, Src, Mods,
3640 /*IsCanonicalizing=*/true,
3641 /*AllowAbs=*/false)) {
3642 if (OpSel)
3643 Mods |= SISrcMods::OP_SEL_0;
3644 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3645 return true;
3646 }
3647
3648 return false;
3649}
3650
3651bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3652 SDValue &SrcMods) const {
3653 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3654}
3655
3656bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3657 SDValue &SrcMods) const {
3658 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3659}
3660
3661bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3662 SDValue &SrcMods, SDValue &Clamp,
3663 SDValue &Omod) const {
3664 SDLoc DL(In);
3665 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3666 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3667
3668 return SelectVOP3Mods(In, Src, SrcMods);
3669}
3670
3671bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3672 SDValue &SrcMods, SDValue &Clamp,
3673 SDValue &Omod) const {
3674 SDLoc DL(In);
3675 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3676 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3677
3678 return SelectVOP3BMods(In, Src, SrcMods);
3679}
3680
3681bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3682 SDValue &Clamp, SDValue &Omod) const {
3683 Src = In;
3684
3685 SDLoc DL(In);
3686 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3687 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3688
3689 return true;
3690}
3691
3692bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3693 SDValue &SrcMods, bool IsDOT) const {
3694 unsigned Mods = SISrcMods::NONE;
3695 Src = In;
3696
3697 // TODO: Handle G_FSUB 0 as fneg
3698 if (Src.getOpcode() == ISD::FNEG) {
3700 Src = Src.getOperand(0);
3701 }
3702
3703 // 64-bit VOP3P instructions do not have OPSEL or ABS. Bail on v2f64 or v2i64.
3704 // TODO: Select NEG_LO and NEG_HI modifiers from BUILD_VECTOR.
3705 if (Src.getValueSizeInBits() == 128) {
3706 Mods |= SISrcMods::OP_SEL_1; // Just the default, OPSEL unsupported.
3707 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3708 return true;
3709 }
3710
3711 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3712 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3713 unsigned VecMods = Mods;
3714
3715 SDValue Lo = stripBitcast(Src.getOperand(0));
3716 SDValue Hi = stripBitcast(Src.getOperand(1));
3717
3718 if (Lo.getOpcode() == ISD::FNEG) {
3719 Lo = stripBitcast(Lo.getOperand(0));
3720 Mods ^= SISrcMods::NEG;
3721 }
3722
3723 if (Hi.getOpcode() == ISD::FNEG) {
3724 Hi = stripBitcast(Hi.getOperand(0));
3725 Mods ^= SISrcMods::NEG_HI;
3726 }
3727
3728 if (isExtractHiElt(Lo, Lo))
3729 Mods |= SISrcMods::OP_SEL_0;
3730
3731 if (isExtractHiElt(Hi, Hi))
3732 Mods |= SISrcMods::OP_SEL_1;
3733
3734 unsigned VecSize = Src.getValueSizeInBits();
3735 Lo = stripExtractLoElt(Lo);
3736 Hi = stripExtractLoElt(Hi);
3737
3738 if (Lo.getValueSizeInBits() > VecSize) {
3739 Lo = CurDAG->getTargetExtractSubreg(
3740 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3741 MVT::getIntegerVT(VecSize), Lo);
3742 }
3743
3744 if (Hi.getValueSizeInBits() > VecSize) {
3745 Hi = CurDAG->getTargetExtractSubreg(
3746 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3747 MVT::getIntegerVT(VecSize), Hi);
3748 }
3749
3750 assert(Lo.getValueSizeInBits() <= VecSize &&
3751 Hi.getValueSizeInBits() <= VecSize);
3752
3753 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3754 // Really a scalar input. Just select from the low half of the register to
3755 // avoid packing.
3756
3757 if (VecSize == Lo.getValueSizeInBits()) {
3758 Src = Lo;
3759 } else if (VecSize == 32) {
3760 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3761 } else {
3762 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3763
3764 SDLoc SL(In);
3766 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3767 Lo.getValueType()), 0);
3768 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3769 : AMDGPU::SReg_64RegClassID;
3770 const SDValue Ops[] = {
3771 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3772 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3773 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3774
3775 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3776 Src.getValueType(), Ops), 0);
3777 }
3778 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3779 return true;
3780 }
3781
3782 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3783 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3784 .bitcastToAPInt().getZExtValue();
3785 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3786 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3787 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3788 return true;
3789 }
3790 }
3791
3792 Mods = VecMods;
3793 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3794 Src.getNumOperands() == 2) {
3795
3796 // TODO: We should repeat the build_vector source check above for the
3797 // vector_shuffle for negates and casts of individual elements.
3798
3799 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3800 ArrayRef<int> Mask = SVN->getMask();
3801
3802 if (Mask[0] < 2 && Mask[1] < 2) {
3803 // src1 should be undef.
3804 SDValue ShuffleSrc = SVN->getOperand(0);
3805
3806 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3807 ShuffleSrc = ShuffleSrc.getOperand(0);
3809 }
3810
3811 if (Mask[0] == 1)
3812 Mods |= SISrcMods::OP_SEL_0;
3813 if (Mask[1] == 1)
3814 Mods |= SISrcMods::OP_SEL_1;
3815
3816 Src = ShuffleSrc;
3817 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3818 return true;
3819 }
3820 }
3821
3822 // Packed instructions do not have abs modifiers.
3823 Mods |= SISrcMods::OP_SEL_1;
3824
3825 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3826 return true;
3827}
3828
3829bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3830 SDValue &SrcMods) const {
3831 return SelectVOP3PMods(In, Src, SrcMods, true);
3832}
3833
3834bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3835 SDValue SrcTmp, SrcModsTmp;
3836 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3837 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3838 Src = SrcTmp;
3839 return true;
3840 }
3841
3842 return false;
3843}
3844
3845bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3846 SDValue &SrcMods) const {
3847 SelectVOP3Mods(In, Src, SrcMods);
3848 unsigned Mods = SISrcMods::OP_SEL_1;
3849 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3850 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3851 return true;
3852}
3853
3854bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3855 SDValue SrcTmp, SrcModsTmp;
3856 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3857 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3858 Src = SrcTmp;
3859 return true;
3860 }
3861
3862 return false;
3863}
3864
3865bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3866 SDValue &Src) const {
3867 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3868 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3869
3870 unsigned Mods = SISrcMods::OP_SEL_1;
3871 unsigned SrcVal = C->getZExtValue();
3872 if (SrcVal == 1)
3873 Mods |= SISrcMods::OP_SEL_0;
3874
3875 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3876 return true;
3877}
3878
3880AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3881 const SDLoc &DL) const {
3882 unsigned DstRegClass;
3883 EVT DstTy;
3884 switch (Elts.size()) {
3885 case 8:
3886 DstRegClass = AMDGPU::VReg_256RegClassID;
3887 DstTy = MVT::v8i32;
3888 break;
3889 case 4:
3890 DstRegClass = AMDGPU::VReg_128RegClassID;
3891 DstTy = MVT::v4i32;
3892 break;
3893 case 2:
3894 DstRegClass = AMDGPU::VReg_64RegClassID;
3895 DstTy = MVT::v2i32;
3896 break;
3897 default:
3898 llvm_unreachable("unhandled Reg sequence size");
3899 }
3900
3902 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3903 for (unsigned i = 0; i < Elts.size(); ++i) {
3904 Ops.push_back(Elts[i]);
3905 Ops.push_back(CurDAG->getTargetConstant(
3907 }
3908 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3909}
3910
3912AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3913 const SDLoc &DL) const {
3914 SmallVector<SDValue, 8> PackedElts;
3915 assert("unhandled Reg sequence size" &&
3916 (Elts.size() == 8 || Elts.size() == 16));
3917
3918 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3919 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3920 for (unsigned i = 0; i < Elts.size(); i += 2) {
3921 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3922 SDValue HiSrc;
3923 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3924 PackedElts.push_back(HiSrc);
3925 } else {
3926 if (Subtarget->useRealTrue16Insts()) {
3927 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3928 // passing to v_perm_b32. Eventually we should use replace v_perm_b32
3929 // by reg_sequence.
3931 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3932 0);
3933 Elts[i] =
3934 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3935 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3936 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3937 MVT::i32, {Elts[i + 1], Undef},
3938 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3939 }
3940 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3941 MachineSDNode *Packed =
3942 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3943 {Elts[i + 1], Elts[i], PackLoLo});
3944 PackedElts.push_back(SDValue(Packed, 0));
3945 }
3946 }
3947 return buildRegSequence32(PackedElts, DL);
3948}
3949
3951AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3952 const SDLoc &DL,
3953 unsigned ElementSize) const {
3954 if (ElementSize == 16)
3955 return buildRegSequence16(Elts, DL);
3956 if (ElementSize == 32)
3957 return buildRegSequence32(Elts, DL);
3958 llvm_unreachable("Unhandled element size");
3959}
3960
3961void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3962 unsigned &Mods,
3964 SDValue &Src, const SDLoc &DL,
3965 unsigned ElementSize) const {
3966 if (ModOpcode == ISD::FNEG) {
3967 Mods |= SISrcMods::NEG;
3968 // Check if all elements also have abs modifier
3969 SmallVector<SDValue, 8> NegAbsElts;
3970 for (auto El : Elts) {
3971 if (El.getOpcode() != ISD::FABS)
3972 break;
3973 NegAbsElts.push_back(El->getOperand(0));
3974 }
3975 if (Elts.size() != NegAbsElts.size()) {
3976 // Neg
3977 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3978 } else {
3979 // Neg and Abs
3980 Mods |= SISrcMods::NEG_HI;
3981 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3982 }
3983 } else {
3984 assert(ModOpcode == ISD::FABS);
3985 // Abs
3986 Mods |= SISrcMods::NEG_HI;
3987 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3988 }
3989}
3990
3991// Check all f16 elements for modifiers while looking through b32 and v2b16
3992// build vector, stop if element does not satisfy ModifierCheck.
3993static void
3995 std::function<bool(SDValue)> ModifierCheck) {
3996 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3997 if (auto *F16Pair =
3998 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3999 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
4000 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
4001 if (!ModifierCheck(ElF16))
4002 break;
4003 }
4004 }
4005 }
4006}
4007
4008bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
4009 SDValue &SrcMods) const {
4010 Src = In;
4011 unsigned Mods = SISrcMods::OP_SEL_1;
4012
4013 // mods are on f16 elements
4014 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4016
4017 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
4018 if (Element.getOpcode() != ISD::FNEG)
4019 return false;
4020 EltsF16.push_back(Element.getOperand(0));
4021 return true;
4022 });
4023
4024 // All elements have neg modifier
4025 if (BV->getNumOperands() * 2 == EltsF16.size()) {
4026 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
4027 Mods |= SISrcMods::NEG;
4028 Mods |= SISrcMods::NEG_HI;
4029 }
4030 }
4031
4032 // mods are on v2f16 elements
4033 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4034 SmallVector<SDValue, 8> EltsV2F16;
4035 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4036 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
4037 // Based on first element decide which mod we match, neg or abs
4038 if (ElV2f16.getOpcode() != ISD::FNEG)
4039 break;
4040 EltsV2F16.push_back(ElV2f16.getOperand(0));
4041 }
4042
4043 // All pairs of elements have neg modifier
4044 if (BV->getNumOperands() == EltsV2F16.size()) {
4045 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
4046 Mods |= SISrcMods::NEG;
4047 Mods |= SISrcMods::NEG_HI;
4048 }
4049 }
4050
4051 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4052 return true;
4053}
4054
4055bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
4056 SDValue &SrcMods) const {
4057 Src = In;
4058 unsigned Mods = SISrcMods::OP_SEL_1;
4059 unsigned ModOpcode;
4060
4061 // mods are on f16 elements
4062 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4064 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
4065 // Based on first element decide which mod we match, neg or abs
4066 if (EltsF16.empty())
4067 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4068 if (ElF16.getOpcode() != ModOpcode)
4069 return false;
4070 EltsF16.push_back(ElF16.getOperand(0));
4071 return true;
4072 });
4073
4074 // All elements have ModOpcode modifier
4075 if (BV->getNumOperands() * 2 == EltsF16.size())
4076 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
4077 }
4078
4079 // mods are on v2f16 elements
4080 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4081 SmallVector<SDValue, 8> EltsV2F16;
4082
4083 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4084 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
4085 // Based on first element decide which mod we match, neg or abs
4086 if (EltsV2F16.empty())
4087 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4088 if (ElV2f16->getOpcode() != ModOpcode)
4089 break;
4090 EltsV2F16.push_back(ElV2f16->getOperand(0));
4091 }
4092
4093 // All elements have ModOpcode modifier
4094 if (BV->getNumOperands() == EltsV2F16.size())
4095 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
4096 }
4097
4098 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4099 return true;
4100}
4101
4102bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
4103 SDValue &SrcMods) const {
4104 Src = In;
4105 unsigned Mods = SISrcMods::OP_SEL_1;
4107
4108 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4109 assert(BV->getNumOperands() > 0);
4110 // Based on first element decide which mod we match, neg or abs
4111 SDValue ElF32 = stripBitcast(BV->getOperand(0));
4112 unsigned ModOpcode =
4113 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4114 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4115 SDValue ElF32 = stripBitcast(BV->getOperand(i));
4116 if (ElF32.getOpcode() != ModOpcode)
4117 break;
4118 EltsF32.push_back(ElF32.getOperand(0));
4119 }
4120
4121 // All elements had ModOpcode modifier
4122 if (BV->getNumOperands() == EltsF32.size())
4123 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4124 }
4125
4126 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4127 return true;
4128}
4129
4130bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4131 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4132 BitVector UndefElements;
4133 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4134 if (isInlineImmediate(Splat.getNode())) {
4135 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4136 unsigned Imm = C->getAPIntValue().getSExtValue();
4137 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4138 return true;
4139 }
4140 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4141 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4142 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4143 return true;
4144 }
4145 llvm_unreachable("unhandled Constant node");
4146 }
4147 }
4148
4149 // 16 bit splat
4150 SDValue SplatSrc32 = stripBitcast(In);
4151 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4152 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4153 SDValue SplatSrc16 = stripBitcast(Splat32);
4154 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4155 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4156 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4157 std::optional<APInt> RawValue;
4158 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4159 RawValue = C->getValueAPF().bitcastToAPInt();
4160 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4161 RawValue = C->getAPIntValue();
4162
4163 if (RawValue.has_value()) {
4164 EVT VT = In.getValueType().getScalarType();
4165 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
4166 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4169 RawValue.value());
4170 if (TII->isInlineConstant(FloatVal)) {
4171 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4172 MVT::i16);
4173 return true;
4174 }
4175 } else if (VT.getSimpleVT() == MVT::i16) {
4176 if (TII->isInlineConstant(RawValue.value())) {
4177 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4178 MVT::i16);
4179 return true;
4180 }
4181 } else
4182 llvm_unreachable("unknown 16-bit type");
4183 }
4184 }
4185 }
4186
4187 // Currently f64 immediate vectors are represented as vectors of v2i32, with
4188 // different lo and hi 32-bit values even though double values are splated.
4189 // So we have to manually compare to determine whether it is splated.
4190 if (CurDAG->isConstantIntBuildVectorOrConstantInt(SplatSrc32)) {
4191 int64_t Imm64 = 0;
4192 for (unsigned i = 0; i < SplatSrc32->getNumOperands(); i += 2) {
4193 auto Lo32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i));
4194 auto Hi32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i + 1));
4195 int64_t LoImm = Lo32->getAPIntValue().getSExtValue();
4196 int64_t HiImm = Hi32->getAPIntValue().getSExtValue();
4197 int64_t Imm64I = (HiImm << 32) + LoImm;
4198 if (i == 0) {
4199 if (!isInlineImmediate(APInt(64, Imm64I)))
4200 return false;
4201 Imm64 = Imm64I;
4202 } else if (Imm64I != Imm64)
4203 return false;
4204 } // end for
4205
4206 Src = CurDAG->getTargetConstant(Imm64, SDLoc(In), MVT::i64);
4207 return true;
4208 }
4209
4210 return false;
4211}
4212
4213bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4214 SDValue &IndexKey) const {
4215 unsigned Key = 0;
4216 Src = In;
4217
4218 if (In.getOpcode() == ISD::SRL) {
4219 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4220 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4221 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4222 ShiftAmt->getZExtValue() % 8 == 0) {
4223 Key = ShiftAmt->getZExtValue() / 8;
4224 Src = ShiftSrc;
4225 }
4226 }
4227
4228 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4229 return true;
4230}
4231
4232bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4233 SDValue &IndexKey) const {
4234 unsigned Key = 0;
4235 Src = In;
4236
4237 if (In.getOpcode() == ISD::SRL) {
4238 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4239 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4240 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4241 ShiftAmt->getZExtValue() == 16) {
4242 Key = 1;
4243 Src = ShiftSrc;
4244 }
4245 }
4246
4247 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4248 return true;
4249}
4250
4251bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4252 SDValue &IndexKey) const {
4253 unsigned Key = 0;
4254 Src = In;
4255
4256 SDValue InI32;
4257
4258 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4259 const SDValue &ExtendSrc = In.getOperand(0);
4260 if (ExtendSrc.getValueSizeInBits() == 32)
4261 InI32 = ExtendSrc;
4262 } else if (In->getOpcode() == ISD::BITCAST) {
4263 const SDValue &CastSrc = In.getOperand(0);
4264 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4265 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4266 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4267 if (Zero && Zero->getZExtValue() == 0)
4268 InI32 = CastSrc.getOperand(0);
4269 }
4270 }
4271
4272 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4273 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4274 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4275 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4276 EltIdx->getZExtValue() == 1) {
4277 Key = 1;
4278 Src = ExtractVecEltSrc;
4279 }
4280 }
4281
4282 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4283 return true;
4284}
4285
4286bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4287 SDValue &SrcMods) const {
4288 Src = In;
4289 // FIXME: Handle op_sel
4290 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4291 return true;
4292}
4293
4294bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4295 SDValue &SrcMods) const {
4296 // FIXME: Handle op_sel
4297 return SelectVOP3Mods(In, Src, SrcMods);
4298}
4299
4300// Match lowered fpext from bf16 to f32. This is a bit operation extending
4301// a 16-bit value with 16-bit of zeroes at LSB:
4302//
4303// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4304// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4305// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4306static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4307 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4308 return SDValue();
4309 Op = Op.getOperand(0);
4310
4311 IsExtractHigh = false;
4312 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4313 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4314 if (!Low16 || !Low16->isZero())
4315 return SDValue();
4316 Op = stripBitcast(Op.getOperand(1));
4317 if (Op.getValueType() != MVT::bf16)
4318 return SDValue();
4319 return Op;
4320 }
4321
4322 if (Op.getValueType() != MVT::i32)
4323 return SDValue();
4324
4325 if (Op.getOpcode() == ISD::AND) {
4326 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4327 if (Mask->getZExtValue() == 0xffff0000) {
4328 IsExtractHigh = true;
4329 return Op.getOperand(0);
4330 }
4331 }
4332 return SDValue();
4333 }
4334
4335 if (Op.getOpcode() == ISD::SHL) {
4336 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4337 if (Amt->getZExtValue() == 16)
4338 return Op.getOperand(0);
4339 }
4340 }
4341
4342 return SDValue();
4343}
4344
4345// The return value is not whether the match is possible (which it always is),
4346// but whether or not it a conversion is really used.
4347bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4348 unsigned &Mods,
4349 MVT VT) const {
4350 Mods = 0;
4351 SelectVOP3ModsImpl(In, Src, Mods);
4352
4353 bool IsExtractHigh = false;
4354 if (Src.getOpcode() == ISD::FP_EXTEND) {
4355 Src = Src.getOperand(0);
4356 } else if (VT == MVT::bf16) {
4357 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4358 if (!B16)
4359 return false;
4360 Src = B16;
4361 } else
4362 return false;
4363
4364 if (Src.getValueType() != VT &&
4365 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4366 return false;
4367
4368 Src = stripBitcast(Src);
4369
4370 // Be careful about folding modifiers if we already have an abs. fneg is
4371 // applied last, so we don't want to apply an earlier fneg.
4372 if ((Mods & SISrcMods::ABS) == 0) {
4373 unsigned ModsTmp;
4374 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4375
4376 if ((ModsTmp & SISrcMods::NEG) != 0)
4377 Mods ^= SISrcMods::NEG;
4378
4379 if ((ModsTmp & SISrcMods::ABS) != 0)
4380 Mods |= SISrcMods::ABS;
4381 }
4382
4383 // op_sel/op_sel_hi decide the source type and source.
4384 // If the source's op_sel_hi is set, it indicates to do a conversion from
4385 // fp16. If the sources's op_sel is set, it picks the high half of the source
4386 // register.
4387
4388 Mods |= SISrcMods::OP_SEL_1;
4389 if (Src.getValueSizeInBits() == 16) {
4390 if (isExtractHiElt(Src, Src)) {
4391 Mods |= SISrcMods::OP_SEL_0;
4392
4393 // TODO: Should we try to look for neg/abs here?
4394 return true;
4395 }
4396
4397 if (Src.getOpcode() == ISD::TRUNCATE &&
4398 Src.getOperand(0).getValueType() == MVT::i32) {
4399 Src = Src.getOperand(0);
4400 return true;
4401 }
4402
4403 if (Subtarget->useRealTrue16Insts())
4404 // In true16 mode, pack src to a 32bit
4405 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4406 } else if (IsExtractHigh)
4407 Mods |= SISrcMods::OP_SEL_0;
4408
4409 return true;
4410}
4411
4412bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4413 SDValue &SrcMods) const {
4414 unsigned Mods = 0;
4415 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4416 return false;
4417 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4418 return true;
4419}
4420
4421bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4422 SDValue &SrcMods) const {
4423 unsigned Mods = 0;
4424 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4425 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4426 return true;
4427}
4428
4429bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4430 SDValue &SrcMods) const {
4431 unsigned Mods = 0;
4432 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4433 return false;
4434 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4435 return true;
4436}
4437
4438bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4439 SDValue &SrcMods) const {
4440 unsigned Mods = 0;
4441 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4442 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4443 return true;
4444}
4445
4446// Match BITOP3 operation and return a number of matched instructions plus
4447// truth table.
4448static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4450 unsigned NumOpcodes = 0;
4451 uint8_t LHSBits, RHSBits;
4452
4453 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4454 // Define truth table given Src0, Src1, Src2 bits permutations:
4455 // 0 0 0
4456 // 0 0 1
4457 // 0 1 0
4458 // 0 1 1
4459 // 1 0 0
4460 // 1 0 1
4461 // 1 1 0
4462 // 1 1 1
4463 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4464
4465 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4466 if (C->isAllOnes()) {
4467 Bits = 0xff;
4468 return true;
4469 }
4470 if (C->isZero()) {
4471 Bits = 0;
4472 return true;
4473 }
4474 }
4475
4476 for (unsigned I = 0; I < Src.size(); ++I) {
4477 // Try to find existing reused operand
4478 if (Src[I] == Op) {
4479 Bits = SrcBits[I];
4480 return true;
4481 }
4482 // Try to replace parent operator
4483 if (Src[I] == In) {
4484 Bits = SrcBits[I];
4485 Src[I] = Op;
4486 return true;
4487 }
4488 }
4489
4490 if (Src.size() == 3) {
4491 // No room left for operands. Try one last time, there can be a 'not' of
4492 // one of our source operands. In this case we can compute the bits
4493 // without growing Src vector.
4494 if (Op.getOpcode() == ISD::XOR) {
4495 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4496 if (C->isAllOnes()) {
4497 SDValue LHS = Op.getOperand(0);
4498 for (unsigned I = 0; I < Src.size(); ++I) {
4499 if (Src[I] == LHS) {
4500 Bits = ~SrcBits[I];
4501 return true;
4502 }
4503 }
4504 }
4505 }
4506 }
4507
4508 return false;
4509 }
4510
4511 Bits = SrcBits[Src.size()];
4512 Src.push_back(Op);
4513 return true;
4514 };
4515
4516 switch (In.getOpcode()) {
4517 case ISD::AND:
4518 case ISD::OR:
4519 case ISD::XOR: {
4520 SDValue LHS = In.getOperand(0);
4521 SDValue RHS = In.getOperand(1);
4522
4523 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4524 if (!getOperandBits(LHS, LHSBits) ||
4525 !getOperandBits(RHS, RHSBits)) {
4526 Src = std::move(Backup);
4527 return std::make_pair(0, 0);
4528 }
4529
4530 // Recursion is naturally limited by the size of the operand vector.
4531 //
4532 // When LHS and RHS share a common sub-expression, one side's recursion
4533 // may decompose that sub-expression and replace the Src slot the other
4534 // side occupies with sub-operands via the "replace parent" path in
4535 // getOperandBits. The other side's cached bit-pattern then refers to a
4536 // slot whose contents changed, producing a wrong truth table.
4537 //
4538 // We detect this in three ways:
4539 // (A) If LHS recursed, its truth table is valid against the Src state
4540 // when LHS recursion completed (SrcAfterLHS). If RHS recursion
4541 // then mutates a Src slot that LHSBits depends on, LHSBits is
4542 // stale.
4543 // (B) If RHS did not recurse, RHSBits came from getOperandBits and
4544 // refers to a specific Src slot. If that slot's contents changed
4545 // (by either recursion), RHSBits is stale.
4546 // (C) Symmetrically for LHS if it did not recurse.
4547 SmallVector<SDValue, 3> SrcBeforeRecurse(Src.begin(), Src.end());
4548 uint8_t LHSBitsOrig = LHSBits;
4549 uint8_t RHSBitsOrig = RHSBits;
4550
4551 auto LHSOp = BitOp3_Op(LHS, Src);
4552 if (LHSOp.first) {
4553 NumOpcodes += LHSOp.first;
4554 LHSBits = LHSOp.second;
4555 }
4556
4557 SmallVector<SDValue, 3> SrcAfterLHS(Src.begin(), Src.end());
4558
4559 auto RHSOp = BitOp3_Op(RHS, Src);
4560 if (RHSOp.first) {
4561 NumOpcodes += RHSOp.first;
4562 RHSBits = RHSOp.second;
4563 }
4564
4565 // dependsOnSlot: true iff the truth table TT varies with slot Slot.
4566 auto dependsOnSlot = [](uint8_t TT, int Slot) -> bool {
4567 if (Slot < 0 || Slot > 2)
4568 return false;
4569 const uint8_t Masks[3] = {0x0f, 0x33, 0x55};
4570 const int Shifts[3] = {4, 2, 1};
4571 return ((TT ^ (TT >> Shifts[Slot])) & Masks[Slot]) != 0;
4572 };
4573
4574 // findSlot: locate the Src slot a getOperandBits result depends on,
4575 // including negated (XOR with -1) patterns that getOperandBits
4576 // resolves via the NOT shortcut (~SrcBits[I]).
4577 const uint8_t SrcBitsConst[3] = {0xf0, 0xcc, 0xaa};
4578 auto findSlot = [&](uint8_t Bits, SDValue Op,
4579 const SmallVectorImpl<SDValue> &S) -> int {
4580 SDValue NegatedInner;
4581 bool IsNegationOp =
4582 Op.getOpcode() == ISD::XOR && isAllOnesConstant(Op.getOperand(1));
4583 if (IsNegationOp)
4584 NegatedInner = Op.getOperand(0);
4585 for (int I = 0; I < (int)S.size(); I++) {
4586 if (Bits == SrcBitsConst[I] && S[I] == Op)
4587 return I;
4588 if (IsNegationOp && Bits == (uint8_t)~SrcBitsConst[I] &&
4589 S[I] == NegatedInner)
4590 return I;
4591 }
4592 return -1;
4593 };
4594
4595 bool Stale = false;
4596
4597 // (A) LHS recursed: its truth table is against SrcAfterLHS.
4598 // Check if RHS recursion mutated a slot that LHSBits uses.
4599 if (LHSOp.first) {
4600 for (int I = 0; I < (int)SrcAfterLHS.size() && I < 3; I++) {
4601 if (I < (int)Src.size() && Src[I] != SrcAfterLHS[I] &&
4602 dependsOnSlot(LHSBits, I)) {
4603 Stale = true;
4604 break;
4605 }
4606 }
4607 }
4608
4609 // (B) RHS did not recurse: RHSBits from getOperandBits is against
4610 // SrcBeforeRecurse. Check if that slot was mutated since then.
4611 if (!Stale && !RHSOp.first) {
4612 int Slot = findSlot(RHSBitsOrig, RHS, SrcBeforeRecurse);
4613 if (Slot >= 0 &&
4614 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4615 Stale = true;
4616 }
4617
4618 // (C) LHS did not recurse: LHSBits from getOperandBits is against
4619 // SrcBeforeRecurse. Check if that slot was mutated since then.
4620 if (!Stale && !LHSOp.first) {
4621 int Slot = findSlot(LHSBitsOrig, LHS, SrcBeforeRecurse);
4622 if (Slot >= 0 &&
4623 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4624 Stale = true;
4625 }
4626
4627 if (Stale) {
4628 Src = std::move(SrcBeforeRecurse);
4629 LHSBits = LHSBitsOrig;
4630 RHSBits = RHSBitsOrig;
4631 NumOpcodes = 0;
4632 }
4633 break;
4634 }
4635 default:
4636 return std::make_pair(0, 0);
4637 }
4638
4639 uint8_t TTbl;
4640 switch (In.getOpcode()) {
4641 case ISD::AND:
4642 TTbl = LHSBits & RHSBits;
4643 break;
4644 case ISD::OR:
4645 TTbl = LHSBits | RHSBits;
4646 break;
4647 case ISD::XOR:
4648 TTbl = LHSBits ^ RHSBits;
4649 break;
4650 default:
4651 break;
4652 }
4653
4654 return std::make_pair(NumOpcodes + 1, TTbl);
4655}
4656
4657bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4658 SDValue &Src2, SDValue &Tbl) const {
4660 uint8_t TTbl;
4661 unsigned NumOpcodes;
4662
4663 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4664
4665 // Src.empty() case can happen if all operands are all zero or all ones.
4666 // Normally it shall be optimized out before reaching this.
4667 if (NumOpcodes < 2 || Src.empty())
4668 return false;
4669
4670 // For a uniform case threshold should be higher to account for moves between
4671 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4672 // and a readtfirstlane after.
4673 if (NumOpcodes < 4 && !In->isDivergent())
4674 return false;
4675
4676 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4677 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4678 // asm more readable. This cannot be modeled with AddedComplexity because
4679 // selector does not know how many operations did we match.
4680 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4681 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4682 In.getOperand(1).getOpcode() == In.getOpcode()))
4683 return false;
4684
4685 if (In.getOpcode() == ISD::OR &&
4686 (In.getOperand(0).getOpcode() == ISD::AND ||
4687 In.getOperand(1).getOpcode() == ISD::AND))
4688 return false;
4689 }
4690
4691 // Last operand can be ignored, turning a ternary operation into a binary.
4692 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4693 // 'c' with 'a' here without changing the answer. In some pathological
4694 // cases it should be possible to get an operation with a single operand
4695 // too if optimizer would not catch it.
4696 while (Src.size() < 3)
4697 Src.push_back(Src[0]);
4698
4699 Src0 = Src[0];
4700 Src1 = Src[1];
4701 Src2 = Src[2];
4702
4703 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4704 return true;
4705}
4706
4707SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4708 if (In.isUndef())
4709 return CurDAG->getUNDEF(MVT::i32);
4710
4711 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4712 SDLoc SL(In);
4713 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4714 }
4715
4716 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4717 SDLoc SL(In);
4718 return CurDAG->getConstant(
4719 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4720 }
4721
4722 SDValue Src;
4723 if (isExtractHiElt(In, Src))
4724 return Src;
4725
4726 return SDValue();
4727}
4728
4729bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4730 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4731
4732 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4733 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4734
4735 unsigned Limit = 0;
4736 bool AllUsesAcceptSReg = true;
4737 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4738 Limit < 10 && U != E; ++U, ++Limit) {
4739 const TargetRegisterClass *RC =
4740 getOperandRegClass(U->getUser(), U->getOperandNo());
4741
4742 // If the register class is unknown, it could be an unknown
4743 // register class that needs to be an SGPR, e.g. an inline asm
4744 // constraint
4745 if (!RC || SIRI->isSGPRClass(RC))
4746 return false;
4747
4748 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4749 RC != &AMDGPU::VS_64_Align2RegClass) {
4750 AllUsesAcceptSReg = false;
4751 SDNode *User = U->getUser();
4752 if (User->isMachineOpcode()) {
4753 unsigned Opc = User->getMachineOpcode();
4754 const MCInstrDesc &Desc = SII->get(Opc);
4755 if (Desc.isCommutable()) {
4756 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4757 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4758 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4759 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4760 const TargetRegisterClass *CommutedRC =
4761 getOperandRegClass(U->getUser(), CommutedOpNo);
4762 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4763 CommutedRC == &AMDGPU::VS_64RegClass ||
4764 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4765 AllUsesAcceptSReg = true;
4766 }
4767 }
4768 }
4769 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4770 // commuting current user. This means have at least one use
4771 // that strictly require VGPR. Thus, we will not attempt to commute
4772 // other user instructions.
4773 if (!AllUsesAcceptSReg)
4774 break;
4775 }
4776 }
4777 return !AllUsesAcceptSReg && (Limit < 10);
4778}
4779
4780bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4781 const auto *Ld = cast<LoadSDNode>(N);
4782 const MachineMemOperand *MMO = Ld->getMemOperand();
4783
4784 // FIXME: We ought to able able to take the direct isDivergent result. We
4785 // cannot rely on the MMO for a uniformity check, and should stop using
4786 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4787 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4788 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4789 // version, and then this can be dropped.
4790 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4791 return false;
4792
4793 return MMO->getSize().hasValue() &&
4794 Ld->getAlign() >=
4795 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4796 uint64_t(4))) &&
4797 (MMO->isInvariant() ||
4798 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4799 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4800 (Subtarget->getScalarizeGlobalBehavior() &&
4801 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4802 Ld->isSimple() &&
4803 static_cast<const SITargetLowering *>(getTargetLowering())
4804 ->isMemOpHasNoClobberedMemOperand(N)));
4805}
4806
4809 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4810 bool IsModified = false;
4811 do {
4812 IsModified = false;
4813
4814 // Go over all selected nodes and try to fold them a bit more
4815 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4816 while (Position != CurDAG->allnodes_end()) {
4817 SDNode *Node = &*Position++;
4819 if (!MachineNode)
4820 continue;
4821
4822 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4823 if (ResNode != Node) {
4824 if (ResNode)
4825 ReplaceUses(Node, ResNode);
4826 IsModified = true;
4827 }
4828 }
4829 CurDAG->RemoveDeadNodes();
4830 } while (IsModified);
4831}
4832
4837
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:119
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
bool isSDWAOperand(const SDNode *N) const
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:296
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
LLVM_ABI PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:860
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.