1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
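
// Both shapes matched above recover the same quantity: the upper half of a
// 32-bit dword (viewed as v2i16, that is element 1 on little-endian targets).
// A minimal standalone sketch of that relationship; extractHiHalf is an
// illustrative name, not used by the selector:
constexpr uint16_t extractHiHalf(uint32_t Dword) {
  return uint16_t(Dword >> 16); // (trunc (srl x, 16)) on the scalar side
}
static_assert(extractHiHalf(0xAAAABBBBu) == 0xAAAAu,
              "high 16 bits of the dword");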
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // The S_MOV is needed since Lo could still be a VGPR16. With S_MOV, isel
104 // inserts a "sgpr32 = copy vgpr16" and we rely on the fixvgpr2sgprcopy
105 // pass to legalize it.
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132} // end anonymous namespace
133
135 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
136 false)
138INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
140#ifdef EXPENSIVE_CHECKS
143#endif
145 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
146 false)
147
148/// This pass converts a legalized DAG into an AMDGPU-specific
149// DAG, ready for instruction scheduling.
150FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
151 CodeGenOptLevel OptLevel) {
152 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
153}
154
158
159bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
160 Subtarget = &MF.getSubtarget<GCNSubtarget>();
161 Subtarget->checkSubtargetFeatures(MF.getFunction());
162 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
163 return SelectionDAGISel::runOnMachineFunction(MF);
164}
165
166bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
167 // XXX - only need to list legal operations.
168 switch (Opc) {
169 case ISD::FADD:
170 case ISD::FSUB:
171 case ISD::FMUL:
172 case ISD::FDIV:
173 case ISD::FREM:
175 case ISD::UINT_TO_FP:
176 case ISD::SINT_TO_FP:
177 case ISD::FABS:
178 // Fabs is lowered to a bit operation, but it's an and which will clear the
179 // high bits anyway.
180 case ISD::FSQRT:
181 case ISD::FSIN:
182 case ISD::FCOS:
183 case ISD::FPOWI:
184 case ISD::FPOW:
185 case ISD::FLOG:
186 case ISD::FLOG2:
187 case ISD::FLOG10:
188 case ISD::FEXP:
189 case ISD::FEXP2:
190 case ISD::FCEIL:
191 case ISD::FTRUNC:
192 case ISD::FRINT:
193 case ISD::FNEARBYINT:
194 case ISD::FROUNDEVEN:
195 case ISD::FROUND:
196 case ISD::FFLOOR:
197 case ISD::FMINNUM:
198 case ISD::FMAXNUM:
199 case ISD::FLDEXP:
200 case AMDGPUISD::FRACT:
201 case AMDGPUISD::CLAMP:
202 case AMDGPUISD::COS_HW:
203 case AMDGPUISD::SIN_HW:
204 case AMDGPUISD::FMIN3:
205 case AMDGPUISD::FMAX3:
206 case AMDGPUISD::FMED3:
207 case AMDGPUISD::FMAD_FTZ:
208 case AMDGPUISD::RCP:
209 case AMDGPUISD::RSQ:
210 case AMDGPUISD::RCP_IFLAG:
211 // On gfx10, all 16-bit instructions preserve the high bits.
212 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
213 case ISD::FP_ROUND:
214 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
215 // high bits on gfx9.
216 // TODO: If we had the source node we could see if the source was fma/mad
217 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
218 case ISD::FMA:
219 case ISD::FMAD:
220 case AMDGPUISD::DIV_FIXUP:
221 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
222 default:
223 // fcopysign, select and others may be lowered to 32-bit bit operations
224 // which don't zero the high bits.
225 return false;
226 }
227}
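
// Concretely, the property being reported: on gfx9 and earlier, a 16-bit op
// such as v_add_f16 writes its result to bits [15:0] of the 32-bit VGPR and
// leaves bits [31:16] zeroed, so conceptually the register holds
//   uint32_t Vgpr = uint32_t(ResultLo16);   // high half known to be zero
// while on gfx10+ the high half keeps its previous contents, which is why the
// cases above are gated on getGeneration() <= GFX9.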
228
229bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
230#ifdef EXPENSIVE_CHECKS
231 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
232 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
233 for (auto &L : LI->getLoopsInPreorder()) {
234 assert(L->isLCSSAForm(DT));
235 }
236#endif
237 return SelectionDAGISelLegacy::runOnMachineFunction(MF);
238}
239
249
250bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
251 assert(Subtarget->d16PreservesUnusedBits());
252 MVT VT = N->getValueType(0).getSimpleVT();
253 if (VT != MVT::v2i16 && VT != MVT::v2f16)
254 return false;
255
256 SDValue Lo = N->getOperand(0);
257 SDValue Hi = N->getOperand(1);
258
259 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
260
261 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
262 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
263 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
264
265 // Need to check for possible indirect dependencies on the other half of the
266 // vector to avoid introducing a cycle.
267 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
268 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
269
270 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
271 SDValue Ops[] = {
272 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
273 };
274
275 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
276 if (LdHi->getMemoryVT() == MVT::i8) {
277 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
278 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
279 } else {
280 assert(LdHi->getMemoryVT() == MVT::i16);
281 }
282
283 SDValue NewLoadHi =
284 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
285 Ops, LdHi->getMemoryVT(),
286 LdHi->getMemOperand());
287
288 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
290 return true;
291 }
292
293 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
294 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
295 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
296 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
297 if (LdLo && Lo.hasOneUse()) {
298 SDValue TiedIn = getHi16Elt(Hi);
299 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
300 return false;
301
302 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
303 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
304 if (LdLo->getMemoryVT() == MVT::i8) {
305 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
306 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
307 } else {
308 assert(LdLo->getMemoryVT() == MVT::i16);
309 }
310
311 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
312
313 SDValue Ops[] = {
314 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
315 };
316
317 SDValue NewLoadLo =
318 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
319 Ops, LdLo->getMemoryVT(),
320 LdLo->getMemOperand());
321
322 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
323 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
324 return true;
325 }
326
327 return false;
328}
329
330void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
331 if (!Subtarget->d16PreservesUnusedBits())
332 return;
333
334 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
335
336 bool MadeChange = false;
337 while (Position != CurDAG->allnodes_begin()) {
338 SDNode *N = &*--Position;
339 if (N->use_empty())
340 continue;
341
342 switch (N->getOpcode()) {
343 case ISD::BUILD_VECTOR:
344 // TODO: Match load d16 from shl (extload:i16), 16
345 MadeChange |= matchLoadD16FromBuildVector(N);
346 break;
347 default:
348 break;
349 }
350 }
351
352 if (MadeChange) {
353 CurDAG->RemoveDeadNodes();
354 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
355 CurDAG->dump(););
356 }
357}
358
359bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
360 if (N->isUndef())
361 return true;
362
363 const SIInstrInfo *TII = Subtarget->getInstrInfo();
364 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
365 return TII->isInlineConstant(C->getAPIntValue());
366
367 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
368 return TII->isInlineConstant(C->getValueAPF());
369
370 return false;
371}
372
373/// Determine the register class for \p OpNo
374/// \returns The register class of the virtual register that will be used for
375/// the given operand number \OpNo or NULL if the register class cannot be
376/// determined.
377const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
378 unsigned OpNo) const {
379 if (!N->isMachineOpcode()) {
380 if (N->getOpcode() == ISD::CopyToReg) {
381 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
382 if (Reg.isVirtual()) {
383 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
384 return MRI.getRegClass(Reg);
385 }
386
387 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
388 return TRI->getPhysRegBaseClass(Reg);
389 }
390
391 return nullptr;
392 }
393
394 switch (N->getMachineOpcode()) {
395 default: {
396 const SIInstrInfo *TII = Subtarget->getInstrInfo();
397 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
398 unsigned OpIdx = Desc.getNumDefs() + OpNo;
399 if (OpIdx >= Desc.getNumOperands())
400 return nullptr;
401
402 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
403 if (RegClass == -1)
404 return nullptr;
405
406 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
407 }
408 case AMDGPU::REG_SEQUENCE: {
409 unsigned RCID = N->getConstantOperandVal(0);
410 const TargetRegisterClass *SuperRC =
411 Subtarget->getRegisterInfo()->getRegClass(RCID);
412
413 SDValue SubRegOp = N->getOperand(OpNo + 1);
414 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
415 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
416 SubRegIdx);
417 }
418 }
419}
420
421SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
422 SDValue Glue) const {
423 SmallVector<SDValue, 8> Ops;
424 Ops.push_back(NewChain); // Replace the chain.
425 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
426 Ops.push_back(N->getOperand(i));
427
428 Ops.push_back(Glue);
429 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
430}
431
432SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
433 const SITargetLowering& Lowering =
434 *static_cast<const SITargetLowering*>(getTargetLowering());
435
436 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
437
438 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
439 return glueCopyToOp(N, M0, M0.getValue(1));
440}
441
442SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
443 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
444 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
445 if (Subtarget->ldsRequiresM0Init())
446 return glueCopyToM0(
447 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
448 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
449 MachineFunction &MF = CurDAG->getMachineFunction();
450 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
451 return
452 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
453 }
454 return N;
455}
456
457MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
458 EVT VT) const {
459 SDNode *Lo = CurDAG->getMachineNode(
460 AMDGPU::S_MOV_B32, DL, MVT::i32,
461 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
462 SDNode *Hi = CurDAG->getMachineNode(
463 AMDGPU::S_MOV_B32, DL, MVT::i32,
464 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
465 const SDValue Ops[] = {
466 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
467 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
468 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
469
470 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
471}
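
// The immediate split above is plain 32-bit halving. A minimal sketch of the
// Lo_32/Hi_32 decomposition feeding sub0/sub1 (illustrative helpers only):
constexpr uint32_t lo32Example(uint64_t Imm) { return uint32_t(Imm); }
constexpr uint32_t hi32Example(uint64_t Imm) { return uint32_t(Imm >> 32); }
static_assert(lo32Example(0x1122334455667788ull) == 0x55667788u &&
                  hi32Example(0x1122334455667788ull) == 0x11223344u,
              "sub0 receives the low half, sub1 the high half");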
472
473SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
474 SelectionDAG &DAG) const {
475 // TODO: Handle undef as zero
476
477 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
478 uint32_t LHSVal, RHSVal;
479 if (getConstantValue(N->getOperand(0), LHSVal) &&
480 getConstantValue(N->getOperand(1), RHSVal)) {
481 SDLoc SL(N);
482 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
483 return DAG.getMachineNode(
484 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
485 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
486 }
487
488 return nullptr;
489}
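
// The fold above packs two 16-bit constants into a single 32-bit move. A
// standalone sketch of the packing used to form K (illustrative helper only):
constexpr uint32_t packV2I16Example(uint32_t LoVal, uint32_t HiVal) {
  return (LoVal & 0xffff) | (HiVal << 16);
}
static_assert(packV2I16Example(0x1234u, 0xabcdu) == 0xabcd1234u,
              "element 0 lands in bits [15:0], element 1 in bits [31:16]");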
490
491void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
492 EVT VT = N->getValueType(0);
493 unsigned NumVectorElts = VT.getVectorNumElements();
494 EVT EltVT = VT.getVectorElementType();
495 SDLoc DL(N);
496 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
497
498 if (NumVectorElts == 1) {
499 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
500 RegClass);
501 return;
502 }
503
504 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
505 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
506 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
507 uint64_t C = 0;
508 bool AllConst = true;
509 unsigned EltSize = EltVT.getSizeInBits();
510 for (unsigned I = 0; I < NumVectorElts; ++I) {
511 SDValue Op = N->getOperand(I);
512 if (Op.isUndef()) {
513 AllConst = false;
514 break;
515 }
516 uint64_t Val;
517 if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
518 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
519 } else
520 Val = cast<ConstantSDNode>(Op)->getZExtValue();
521 C |= Val << (EltSize * I);
522 }
523 if (AllConst) {
524 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
525 MachineSDNode *Copy =
526 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
527 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
528 RegClass);
529 return;
530 }
531 }
532
533 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
534 "supported yet");
535 // 32 = Max Num Vector Elements
536 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
537 // 1 = Vector Register Class
538 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
539
540 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
541 bool IsRegSeq = true;
542 unsigned NOps = N->getNumOperands();
543 for (unsigned i = 0; i < NOps; i++) {
544 // XXX: Why is this here?
545 if (isa<RegisterSDNode>(N->getOperand(i))) {
546 IsRegSeq = false;
547 break;
548 }
549 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
550 : R600RegisterInfo::getSubRegFromChannel(i);
551 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
552 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
553 }
554 if (NOps != NumVectorElts) {
555 // Fill in the missing undef elements if this was a scalar_to_vector.
556 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
557 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
558 DL, EltVT);
559 for (unsigned i = NOps; i < NumVectorElts; ++i) {
560 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
561 : R600RegisterInfo::getSubRegFromChannel(i);
562 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
563 RegSeqArgs[1 + (2 * i) + 1] =
564 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
565 }
566 }
567
568 if (!IsRegSeq)
569 SelectCode(N);
570 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
571}
572
573void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
574 EVT VT = N->getValueType(0);
575 EVT EltVT = VT.getVectorElementType();
576
577 // TODO: Handle 16-bit element vectors with even aligned masks.
578 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
579 VT.getVectorNumElements() != 2) {
580 SelectCode(N);
581 return;
582 }
583
584 auto *SVN = cast<ShuffleVectorSDNode>(N);
585
586 SDValue Src0 = SVN->getOperand(0);
587 SDValue Src1 = SVN->getOperand(1);
588 ArrayRef<int> Mask = SVN->getMask();
589 SDLoc DL(N);
590
591 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
592 Mask[0] < 4 && Mask[1] < 4);
593
594 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
595 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
596 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
597 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
598
599 if (Mask[0] < 0) {
600 Src0SubReg = Src1SubReg;
601 MachineSDNode *ImpDef =
602 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
603 VSrc0 = SDValue(ImpDef, 0);
604 }
605
606 if (Mask[1] < 0) {
607 Src1SubReg = Src0SubReg;
608 MachineSDNode *ImpDef =
609 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
610 VSrc1 = SDValue(ImpDef, 0);
611 }
612
613 // SGPR case needs to lower to copies.
614 //
615 // Also use subregister extract when we can directly blend the registers with
616 // a simple subregister copy.
617 //
618 // TODO: Maybe we should fold this out earlier
619 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
620 Src1SubReg == AMDGPU::sub0) {
621 // The low element of the result always comes from src0.
622 // The high element of the result always comes from src1.
623 // op_sel selects the high half of src0.
624 // op_sel_hi selects the high half of src1.
625
626 unsigned Src0OpSel =
627 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
628 unsigned Src1OpSel =
629 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
630
631 // Enable op_sel_hi to avoid printing it. This should have no effect on the
632 // result.
633 Src0OpSel |= SISrcMods::OP_SEL_1;
634 Src1OpSel |= SISrcMods::OP_SEL_1;
635
636 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
637 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
638 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
639
640 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
641 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
642 ZeroMods, // clamp
643 ZeroMods, // op_sel
644 ZeroMods, // op_sel_hi
645 ZeroMods, // neg_lo
646 ZeroMods}); // neg_hi
647 return;
648 }
649
650 SDValue ResultElt0 =
651 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
652 SDValue ResultElt1 =
653 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
654
655 const SDValue Ops[] = {
656 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
657 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
658 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
659 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
660}
661
662void AMDGPUDAGToDAGISel::Select(SDNode *N) {
663 unsigned int Opc = N->getOpcode();
664 if (N->isMachineOpcode()) {
665 N->setNodeId(-1);
666 return; // Already selected.
667 }
668
669 // isa<MemSDNode> almost works but is slightly too permissive for some DS
670 // intrinsics.
671 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
672 N = glueCopyToM0LDSInit(N);
673 SelectCode(N);
674 return;
675 }
676
677 switch (Opc) {
678 default:
679 break;
680 // We are selecting i64 ADD here instead of custom lower it during
681 // DAG legalization, so we can fold some i64 ADDs used for address
682 // calculation into the LOAD and STORE instructions.
683 case ISD::ADDC:
684 case ISD::ADDE:
685 case ISD::SUBC:
686 case ISD::SUBE: {
687 if (N->getValueType(0) != MVT::i64)
688 break;
689
690 SelectADD_SUB_I64(N);
691 return;
692 }
693 case ISD::UADDO_CARRY:
694 case ISD::USUBO_CARRY:
695 if (N->getValueType(0) != MVT::i32)
696 break;
697
698 SelectAddcSubb(N);
699 return;
700 case ISD::UADDO:
701 case ISD::USUBO: {
702 SelectUADDO_USUBO(N);
703 return;
704 }
705 case AMDGPUISD::FMUL_W_CHAIN: {
706 SelectFMUL_W_CHAIN(N);
707 return;
708 }
709 case AMDGPUISD::FMA_W_CHAIN: {
710 SelectFMA_W_CHAIN(N);
711 return;
712 }
713
714 case ISD::SCALAR_TO_VECTOR:
715 case ISD::BUILD_VECTOR: {
716 EVT VT = N->getValueType(0);
717 unsigned NumVectorElts = VT.getVectorNumElements();
718 if (VT.getScalarSizeInBits() == 16) {
719 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
720 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
721 ReplaceNode(N, Packed);
722 return;
723 }
724 }
725
726 break;
727 }
728
729 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
730 assert(VT.getVectorElementType().bitsEq(MVT::i32));
731 const TargetRegisterClass *RegClass =
732 N->isDivergent()
733 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
734 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
735
736 SelectBuildVector(N, RegClass->getID());
737 return;
738 }
739 case ISD::VECTOR_SHUFFLE:
740 SelectVectorShuffle(N);
741 return;
742 case ISD::BUILD_PAIR: {
743 SDValue RC, SubReg0, SubReg1;
744 SDLoc DL(N);
745 if (N->getValueType(0) == MVT::i128) {
746 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
747 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
748 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
749 } else if (N->getValueType(0) == MVT::i64) {
750 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
751 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
752 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
753 } else {
754 llvm_unreachable("Unhandled value type for BUILD_PAIR");
755 }
756 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
757 N->getOperand(1), SubReg1 };
758 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
759 N->getValueType(0), Ops));
760 return;
761 }
762
763 case ISD::Constant:
764 case ISD::ConstantFP: {
765 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
766 Subtarget->has64BitLiterals())
767 break;
768
769 uint64_t Imm;
770 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
771 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
772 if (AMDGPU::isValid32BitLiteral(Imm, true))
773 break;
774 } else {
775 ConstantSDNode *C = cast<ConstantSDNode>(N);
776 Imm = C->getZExtValue();
777 if (AMDGPU::isValid32BitLiteral(Imm, false))
778 break;
779 }
780
781 SDLoc DL(N);
782 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
783 return;
784 }
785 case AMDGPUISD::BFE_I32:
786 case AMDGPUISD::BFE_U32: {
787 // There is a scalar version available, but unlike the vector version which
788 // has a separate operand for the offset and width, the scalar version packs
789 // the width and offset into a single operand. Try to move to the scalar
790 // version if the offsets are constant, so that we can try to keep extended
791 // loads of kernel arguments in SGPRs.
792
793 // TODO: Technically we could try to pattern match scalar bitshifts of
794 // dynamic values, but it's probably not useful.
795 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
796 if (!Offset)
797 break;
798
799 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
800 if (!Width)
801 break;
802
803 bool Signed = Opc == AMDGPUISD::BFE_I32;
804
805 uint32_t OffsetVal = Offset->getZExtValue();
806 uint32_t WidthVal = Width->getZExtValue();
807
808 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
809 WidthVal));
810 return;
811 }
812 case AMDGPUISD::DIV_SCALE: {
813 SelectDIV_SCALE(N);
814 return;
815 }
816 case AMDGPUISD::MAD_I64_I32:
817 case AMDGPUISD::MAD_U64_U32: {
818 SelectMAD_64_32(N);
819 return;
820 }
821 case ISD::SMUL_LOHI:
822 case ISD::UMUL_LOHI:
823 return SelectMUL_LOHI(N);
824 case ISD::CopyToReg: {
825 const SITargetLowering& Lowering =
826 *static_cast<const SITargetLowering*>(getTargetLowering());
827 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
828 break;
829 }
830 case ISD::AND:
831 case ISD::SRL:
832 case ISD::SRA:
833 case ISD::SIGN_EXTEND_INREG:
834 if (N->getValueType(0) != MVT::i32)
835 break;
836
837 SelectS_BFE(N);
838 return;
839 case ISD::BRCOND:
840 SelectBRCOND(N);
841 return;
842 case ISD::FP_EXTEND:
843 SelectFP_EXTEND(N);
844 return;
845 case AMDGPUISD::CVT_PKRTZ_F16_F32:
846 case AMDGPUISD::CVT_PKNORM_I16_F32:
847 case AMDGPUISD::CVT_PKNORM_U16_F32:
848 case AMDGPUISD::CVT_PK_U16_U32:
849 case AMDGPUISD::CVT_PK_I16_I32: {
850 // Hack around using a legal type if f16 is illegal.
851 if (N->getValueType(0) == MVT::i32) {
852 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
853 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
854 { N->getOperand(0), N->getOperand(1) });
855 SelectCode(N);
856 return;
857 }
858
859 break;
860 }
861 case ISD::INTRINSIC_W_CHAIN: {
862 SelectINTRINSIC_W_CHAIN(N);
863 return;
864 }
865 case ISD::INTRINSIC_WO_CHAIN: {
866 SelectINTRINSIC_WO_CHAIN(N);
867 return;
868 }
869 case ISD::INTRINSIC_VOID: {
870 SelectINTRINSIC_VOID(N);
871 return;
872 }
873 case AMDGPUISD::WAVE_ADDRESS: {
874 SelectWAVE_ADDRESS(N);
875 return;
876 }
877 case ISD::STACKRESTORE: {
878 SelectSTACKRESTORE(N);
879 return;
880 }
881 }
882
883 SelectCode(N);
884}
885
886bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
887 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
888 const Instruction *Term = BB->getTerminator();
889 return Term->getMetadata("amdgpu.uniform") ||
890 Term->getMetadata("structurizecfg.uniform");
891}
892
893bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
894 unsigned ShAmtBits) const {
895 assert(N->getOpcode() == ISD::AND);
896
897 const APInt &RHS = N->getConstantOperandAPInt(1);
898 if (RHS.countr_one() >= ShAmtBits)
899 return true;
900
901 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
902 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
903}
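
// Rationale by example: a 32-bit shift only consumes the low 5 bits of its
// amount, so for ShAmtBits == 5 an "and" whose mask has at least 5 trailing
// ones cannot change the shift. A minimal sketch (illustrative helper only):
constexpr bool maskIsRedundantForShift5(uint32_t X, uint32_t Mask) {
  return ((X & Mask) & 31) == (X & 31); // hardware reads only amt[4:0]
}
static_assert(maskIsRedundantForShift5(0x123u, 0xffu),
              "0xff has 8 >= 5 trailing ones, so the mask is a no-op here");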
904
906 SDValue &N0, SDValue &N1) {
907 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
909 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
910 // (i64 (bitcast (v2i32 (build_vector
911 // (or (extract_vector_elt V, 0), OFFSET),
912 // (extract_vector_elt V, 1)))))
913 SDValue Lo = Addr.getOperand(0).getOperand(0);
914 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
915 SDValue BaseLo = Lo.getOperand(0);
916 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
917 // Check that split base (Lo and Hi) are extracted from the same one.
918 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
919 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
920 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
921 // Lo is statically extracted from index 0.
922 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
923 BaseLo.getConstantOperandVal(1) == 0 &&
924 // Hi is statically extracted from index 1.
925 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
926 BaseHi.getConstantOperandVal(1) == 1) {
927 N0 = BaseLo.getOperand(0).getOperand(0);
928 N1 = Lo.getOperand(1);
929 return true;
930 }
931 }
932 }
933 return false;
934}
935
936bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
937 SDValue &RHS) const {
938 if (CurDAG->isBaseWithConstantOffset(Addr)) {
939 LHS = Addr.getOperand(0);
940 RHS = Addr.getOperand(1);
941 return true;
942 }
943
946 return true;
947 }
948
949 return false;
950}
951
953 return "AMDGPU DAG->DAG Pattern Instruction Selection";
954}
955
959
963#ifdef EXPENSIVE_CHECKS
964 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
965 .getManager();
966 auto &F = MF.getFunction();
967 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
968 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
969 for (auto &L : LI.getLoopsInPreorder())
970 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
971#endif
972 return SelectionDAGISelPass::run(MF, MFAM);
973}
974
975//===----------------------------------------------------------------------===//
976// Complex Patterns
977//===----------------------------------------------------------------------===//
978
979bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
980 SDValue &Offset) {
981 return false;
982}
983
984bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
985 SDValue &Offset) {
986 ConstantSDNode *C;
987 SDLoc DL(Addr);
988
989 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
990 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
991 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
992 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
993 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
994 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
995 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
996 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
997 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
998 Base = Addr.getOperand(0);
999 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1000 } else {
1001 Base = Addr;
1002 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1003 }
1004
1005 return true;
1006}
1007
1008SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1009 const SDLoc &DL) const {
1010 SDNode *Mov = CurDAG->getMachineNode(
1011 AMDGPU::S_MOV_B32, DL, MVT::i32,
1012 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1013 return SDValue(Mov, 0);
1014}
1015
1016// FIXME: Should only handle uaddo_carry/usubo_carry
1017void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1018 SDLoc DL(N);
1019 SDValue LHS = N->getOperand(0);
1020 SDValue RHS = N->getOperand(1);
1021
1022 unsigned Opcode = N->getOpcode();
1023 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1024 bool ProduceCarry =
1025 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1026 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1027
1028 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1029 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1030
1031 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1032 DL, MVT::i32, LHS, Sub0);
1033 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1034 DL, MVT::i32, LHS, Sub1);
1035
1036 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1037 DL, MVT::i32, RHS, Sub0);
1038 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1039 DL, MVT::i32, RHS, Sub1);
1040
1041 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1042
1043 static const unsigned OpcMap[2][2][2] = {
1044 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1045 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1046 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1047 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1048
1049 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1050 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1051
1052 SDNode *AddLo;
1053 if (!ConsumeCarry) {
1054 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1055 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1056 } else {
1057 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1058 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1059 }
1060 SDValue AddHiArgs[] = {
1061 SDValue(Hi0, 0),
1062 SDValue(Hi1, 0),
1063 SDValue(AddLo, 1)
1064 };
1065 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1066
1067 SDValue RegSequenceArgs[] = {
1068 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1069 SDValue(AddLo,0),
1070 Sub0,
1071 SDValue(AddHi,0),
1072 Sub1,
1073 };
1074 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1075 MVT::i64, RegSequenceArgs);
1076
1077 if (ProduceCarry) {
1078 // Replace the carry-use
1079 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1080 }
1081
1082 // Replace the remaining uses.
1083 ReplaceNode(N, RegSequence);
1084}
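
// Shape of the expansion above for the add case, sketched with plain 32-bit
// arithmetic: the low halves are added first and the carry-out feeds the
// high-half add, the same chain the s_add_u32/s_addc_u32 (or v_add_co/v_addc)
// pair chosen from OpcMap implements. Illustrative helper only:
inline void add64Sketch(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi,
                        uint32_t &Lo, uint32_t &Hi) {
  Lo = ALo + BLo;
  uint32_t Carry = Lo < ALo ? 1u : 0u; // carry-out of the low add (glue value)
  Hi = AHi + BHi + Carry;              // consumed by the high-half add
}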
1085
1086void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1087 SDValue LHS = N->getOperand(0);
1088 SDValue RHS = N->getOperand(1);
1089 SDValue CI = N->getOperand(2);
1090
1091 if (N->isDivergent()) {
1092 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1093 : AMDGPU::V_SUBB_U32_e64;
1094 CurDAG->SelectNodeTo(
1095 N, Opc, N->getVTList(),
1096 {LHS, RHS, CI,
1097 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1098 } else {
1099 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1100 : AMDGPU::S_SUB_CO_PSEUDO;
1101 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1102 }
1103}
1104
1105void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1106 // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1107 // carry out despite the _i32 name. These were renamed in VI to _U32.
1108 // FIXME: We should probably rename the opcodes here.
1109 bool IsAdd = N->getOpcode() == ISD::UADDO;
1110 bool IsVALU = N->isDivergent();
1111
1112 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1113 ++UI)
1114 if (UI.getUse().getResNo() == 1) {
1115 if (UI->isMachineOpcode()) {
1116 if (UI->getMachineOpcode() !=
1117 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1118 IsVALU = true;
1119 break;
1120 }
1121 } else {
1122 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1123 IsVALU = true;
1124 break;
1125 }
1126 }
1127 }
1128
1129 if (IsVALU) {
1130 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1131
1132 CurDAG->SelectNodeTo(
1133 N, Opc, N->getVTList(),
1134 {N->getOperand(0), N->getOperand(1),
1135 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1136 } else {
1137 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1138
1139 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1140 {N->getOperand(0), N->getOperand(1)});
1141 }
1142}
1143
1144void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1145 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1146 SDValue Ops[10];
1147
1148 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1149 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1150 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1151 Ops[8] = N->getOperand(0);
1152 Ops[9] = N->getOperand(4);
1153
1154 // If there are no source modifiers, prefer fmac over fma because it can use
1155 // the smaller VOP2 encoding.
1156 bool UseFMAC = Subtarget->hasDLInsts() &&
1157 cast<ConstantSDNode>(Ops[0])->isZero() &&
1158 cast<ConstantSDNode>(Ops[2])->isZero() &&
1159 cast<ConstantSDNode>(Ops[4])->isZero();
1160 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1161 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1162}
1163
1164void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1165 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1166 SDValue Ops[8];
1167
1168 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1169 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1170 Ops[6] = N->getOperand(0);
1171 Ops[7] = N->getOperand(3);
1172
1173 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1174}
1175
1176// We need to handle this here because tablegen doesn't support matching
1177// instructions with multiple outputs.
1178void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1179 EVT VT = N->getValueType(0);
1180
1181 assert(VT == MVT::f32 || VT == MVT::f64);
1182
1183 unsigned Opc
1184 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1185
1186 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1187 // omod
1188 SDValue Ops[8];
1189 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1190 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1191 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1192 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1193}
1194
1195// We need to handle this here because tablegen doesn't support matching
1196// instructions with multiple outputs.
1197void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1198 SDLoc SL(N);
1199 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1200 unsigned Opc;
1201 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
1202 if (Subtarget->hasMADIntraFwdBug())
1203 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1204 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1205 else if (UseNoCarry)
1206 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1207 else
1208 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1209
1210 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1211 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1212 Clamp };
1213
1214 if (UseNoCarry) {
1215 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1216 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1217 CurDAG->RemoveDeadNode(N);
1218 return;
1219 }
1220
1221 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1222}
1223
1224// We need to handle this here because tablegen doesn't support matching
1225// instructions with multiple outputs.
1226void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1227 SDLoc SL(N);
1228 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1229 SDVTList VTList;
1230 unsigned Opc;
1231 if (Subtarget->hasMadU64U32NoCarry()) {
1232 VTList = CurDAG->getVTList(MVT::i64);
1233 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1234 } else {
1235 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1236 if (Subtarget->hasMADIntraFwdBug()) {
1237 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1238 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1239 } else {
1240 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1241 }
1242 }
1243
1244 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1245 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1246 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1247 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1248 if (!SDValue(N, 0).use_empty()) {
1249 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1250 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1251 MVT::i32, SDValue(Mad, 0), Sub0);
1252 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1253 }
1254 if (!SDValue(N, 1).use_empty()) {
1255 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1256 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1257 MVT::i32, SDValue(Mad, 0), Sub1);
1258 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1259 }
1260 CurDAG->RemoveDeadNode(N);
1261}
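
// The selection above forms the full product with a multiply-add whose addend
// is zero and then splits the 64-bit result into sub0/sub1. A scalar sketch of
// the unsigned case (illustrative helper only):
inline void mulLoHiSketch(uint32_t A, uint32_t B, uint32_t &Lo, uint32_t &Hi) {
  uint64_t Full = uint64_t(A) * uint64_t(B) + 0; // v_mad_u64_u32 a, b, 0
  Lo = uint32_t(Full);                           // EXTRACT_SUBREG sub0
  Hi = uint32_t(Full >> 32);                     // EXTRACT_SUBREG sub1
}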
1262
1263bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1264 if (!isUInt<16>(Offset))
1265 return false;
1266
1267 if (!Base || Subtarget->hasUsableDSOffset() ||
1268 Subtarget->unsafeDSOffsetFoldingEnabled())
1269 return true;
1270
1271 // On Southern Islands, instructions with a negative base value and an offset
1272 // don't seem to work.
1273 return CurDAG->SignBitIsZero(Base);
1274}
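
// Worked example of the constraint checked above: DS instructions carry a
// 16-bit unsigned byte offset, so 65535 is encodable while 65536 must remain
// part of the address computation.
static_assert(isUInt<16>(65535) && !isUInt<16>(65536),
              "DS offsets are 16-bit unsigned immediates");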
1275
1276bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1277 SDValue &Offset) const {
1278 SDLoc DL(Addr);
1279 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1280 SDValue N0 = Addr.getOperand(0);
1281 SDValue N1 = Addr.getOperand(1);
1282 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1283 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1284 // (add n0, c0)
1285 Base = N0;
1286 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1287 return true;
1288 }
1289 } else if (Addr.getOpcode() == ISD::SUB) {
1290 // sub C, x -> add (sub 0, x), C
1291 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1292 int64_t ByteOffset = C->getSExtValue();
1293 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1294 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1295
1296 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1297 // the known bits in isDSOffsetLegal. We need to emit the selected node
1298 // here, so this is thrown away.
1299 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1300 Zero, Addr.getOperand(1));
1301
1302 if (isDSOffsetLegal(Sub, ByteOffset)) {
1303 SmallVector<SDValue, 3> Opnds;
1304 Opnds.push_back(Zero);
1305 Opnds.push_back(Addr.getOperand(1));
1306
1307 // FIXME: Select to VOP3 version for with-carry.
1308 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1309 if (Subtarget->hasAddNoCarry()) {
1310 SubOp = AMDGPU::V_SUB_U32_e64;
1311 Opnds.push_back(
1312 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1313 }
1314
1315 MachineSDNode *MachineSub =
1316 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1317
1318 Base = SDValue(MachineSub, 0);
1319 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1320 return true;
1321 }
1322 }
1323 }
1324 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1325 // If we have a constant address, prefer to put the constant into the
1326 // offset. This can save moves to load the constant address since multiple
1327 // operations can share the zero base address register, and enables merging
1328 // into read2 / write2 instructions.
1329
1330 SDLoc DL(Addr);
1331
1332 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1333 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1334 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1335 DL, MVT::i32, Zero);
1336 Base = SDValue(MovZero, 0);
1337 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1338 return true;
1339 }
1340 }
1341
1342 // default case
1343 Base = Addr;
1344 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1345 return true;
1346}
1347
1348bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1349 unsigned Offset1,
1350 unsigned Size) const {
1351 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1352 return false;
1353 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1354 return false;
1355
1356 if (!Base || Subtarget->hasUsableDSOffset() ||
1357 Subtarget->unsafeDSOffsetFoldingEnabled())
1358 return true;
1359
1360 // On Southern Islands, instructions with a negative base value and an offset
1361 // don't seem to work.
1362 return CurDAG->SignBitIsZero(Base);
1363}
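
// Worked example for the two-offset (read2/write2) form checked above: with
// Size == 4 (e.g. ds_read2_b32), each byte offset must be a multiple of 4 and
// offset/Size must fit in 8 bits, so byte offsets up to 255 * 4 = 1020 encode.
static_assert(isUInt<8>(1020 / 4) && !isUInt<8>(1024 / 4),
              "read2/write2 offsets are encoded per element, 8 bits each");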
1364
1365// Return whether the operation has NoUnsignedWrap property.
1366static bool isNoUnsignedWrap(SDValue Addr) {
1367 return (Addr.getOpcode() == ISD::ADD &&
1368 Addr->getFlags().hasNoUnsignedWrap()) ||
1369 Addr->getOpcode() == ISD::OR;
1370}
1371
1372// Check that the base address of flat scratch load/store in the form of `base +
1373// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1374// requirement). We always treat the first operand as the base address here.
1375bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1376 if (isNoUnsignedWrap(Addr))
1377 return true;
1378
1379 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1380 // values.
1381 if (Subtarget->hasSignedScratchOffsets())
1382 return true;
1383
1384 auto LHS = Addr.getOperand(0);
1385 auto RHS = Addr.getOperand(1);
1386
1387 // If the immediate offset is negative and within certain range, the base
1388 // address cannot also be negative. If the base is also negative, the sum
1389 // would be either negative or much larger than the valid range of scratch
1390 // memory a thread can access.
1391 ConstantSDNode *ImmOp = nullptr;
1392 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1393 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1394 return true;
1395 }
1396
1397 return CurDAG->SignBitIsZero(LHS);
1398}
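
// Example of the reasoning above: take an immediate of -0x1000. If the base
// were negative as well, base + (-0x1000) would be either negative or, after
// unsigned wrapping, far beyond any scratch size a thread can access, so an
// address known to be valid implies the base's sign bit is clear. Only
// immediates no more negative than -0x40000000 get this shortcut so that the
// "much larger than the valid range" argument still holds.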
1399
1400// Check address value in SGPR/VGPR are legal for flat scratch in the form
1401// of: SGPR + VGPR.
1402bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1403 if (isNoUnsignedWrap(Addr))
1404 return true;
1405
1406 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1407 // values.
1408 if (Subtarget->hasSignedScratchOffsets())
1409 return true;
1410
1411 auto LHS = Addr.getOperand(0);
1412 auto RHS = Addr.getOperand(1);
1413 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1414}
1415
1416// Check address value in SGPR/VGPR are legal for flat scratch in the form
1417// of: SGPR + VGPR + Imm.
1418bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1419 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1420 // values.
1421 if (AMDGPU::isGFX12Plus(*Subtarget))
1422 return true;
1423
1424 auto Base = Addr.getOperand(0);
1425 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1426 // If the immediate offset is negative and within certain range, the base
1427 // address cannot also be negative. If the base is also negative, the sum
1428 // would be either negative or much larger than the valid range of scratch
1429 // memory a thread can access.
1430 if (isNoUnsignedWrap(Base) &&
1431 (isNoUnsignedWrap(Addr) ||
1432 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1433 return true;
1434
1435 auto LHS = Base.getOperand(0);
1436 auto RHS = Base.getOperand(1);
1437 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1438}
1439
1440// TODO: If offset is too big, put low 16-bit into offset.
1441bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1442 SDValue &Offset0,
1443 SDValue &Offset1) const {
1444 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1445}
1446
1447bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1448 SDValue &Offset0,
1449 SDValue &Offset1) const {
1450 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1451}
1452
1453bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1454 SDValue &Offset0, SDValue &Offset1,
1455 unsigned Size) const {
1456 SDLoc DL(Addr);
1457
1458 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1459 SDValue N0 = Addr.getOperand(0);
1460 SDValue N1 = Addr.getOperand(1);
1461 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1462 unsigned OffsetValue0 = C1->getZExtValue();
1463 unsigned OffsetValue1 = OffsetValue0 + Size;
1464
1465 // (add n0, c0)
1466 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1467 Base = N0;
1468 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1469 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1470 return true;
1471 }
1472 } else if (Addr.getOpcode() == ISD::SUB) {
1473 // sub C, x -> add (sub 0, x), C
1474 if (const ConstantSDNode *C =
1475 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1476 unsigned OffsetValue0 = C->getZExtValue();
1477 unsigned OffsetValue1 = OffsetValue0 + Size;
1478
1479 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1480 SDLoc DL(Addr);
1481 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1482
1483 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1484 // the known bits in isDSOffsetLegal. We need to emit the selected node
1485 // here, so this is thrown away.
1486 SDValue Sub =
1487 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1488
1489 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1490 SmallVector<SDValue, 3> Opnds;
1491 Opnds.push_back(Zero);
1492 Opnds.push_back(Addr.getOperand(1));
1493 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1494 if (Subtarget->hasAddNoCarry()) {
1495 SubOp = AMDGPU::V_SUB_U32_e64;
1496 Opnds.push_back(
1497 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1498 }
1499
1500 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1501 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1502
1503 Base = SDValue(MachineSub, 0);
1504 Offset0 =
1505 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1506 Offset1 =
1507 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1508 return true;
1509 }
1510 }
1511 }
1512 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1513 unsigned OffsetValue0 = CAddr->getZExtValue();
1514 unsigned OffsetValue1 = OffsetValue0 + Size;
1515
1516 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1517 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1518 MachineSDNode *MovZero =
1519 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1520 Base = SDValue(MovZero, 0);
1521 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1522 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1523 return true;
1524 }
1525 }
1526
1527 // default case
1528
1529 Base = Addr;
1530 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1531 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1532 return true;
1533}
1534
1535bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1536 SDValue &SOffset, SDValue &Offset,
1537 SDValue &Offen, SDValue &Idxen,
1538 SDValue &Addr64) const {
1539 // Subtarget prefers to use flat instruction
1540 // FIXME: This should be a pattern predicate and not reach here
1541 if (Subtarget->useFlatForGlobal())
1542 return false;
1543
1544 SDLoc DL(Addr);
1545
1546 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1547 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1548 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1549 SOffset = Subtarget->hasRestrictedSOffset()
1550 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1551 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1552
1553 ConstantSDNode *C1 = nullptr;
1554 SDValue N0 = Addr;
1555 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1556 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1557 if (isUInt<32>(C1->getZExtValue()))
1558 N0 = Addr.getOperand(0);
1559 else
1560 C1 = nullptr;
1561 }
1562
1563 if (N0->isAnyAdd()) {
1564 // (add N2, N3) -> addr64, or
1565 // (add (add N2, N3), C1) -> addr64
1566 SDValue N2 = N0.getOperand(0);
1567 SDValue N3 = N0.getOperand(1);
1568 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1569
1570 if (N2->isDivergent()) {
1571 if (N3->isDivergent()) {
1572 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1573 // addr64, and construct the resource from a 0 address.
1574 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1575 VAddr = N0;
1576 } else {
1577 // N2 is divergent, N3 is not.
1578 Ptr = N3;
1579 VAddr = N2;
1580 }
1581 } else {
1582 // N2 is not divergent.
1583 Ptr = N2;
1584 VAddr = N3;
1585 }
1586 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1587 } else if (N0->isDivergent()) {
1588 // N0 is divergent. Use it as the addr64, and construct the resource from a
1589 // 0 address.
1590 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1591 VAddr = N0;
1592 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1593 } else {
1594 // N0 -> offset, or
1595 // (N0 + C1) -> offset
1596 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1597 Ptr = N0;
1598 }
1599
1600 if (!C1) {
1601 // No offset.
1602 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1603 return true;
1604 }
1605
1606 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1607 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1608 // Legal offset for instruction.
1609 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1610 return true;
1611 }
1612
1613 // Illegal offset, store it in soffset.
1614 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1615 SOffset =
1616 SDValue(CurDAG->getMachineNode(
1617 AMDGPU::S_MOV_B32, DL, MVT::i32,
1618 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1619 0);
1620 return true;
1621}
1622
1623bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1624 SDValue &VAddr, SDValue &SOffset,
1625 SDValue &Offset) const {
1626 SDValue Ptr, Offen, Idxen, Addr64;
1627
1628 // addr64 bit was removed for volcanic islands.
1629 // FIXME: This should be a pattern predicate and not reach here
1630 if (!Subtarget->hasAddr64())
1631 return false;
1632
1633 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1634 return false;
1635
1636 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1637 if (C->getSExtValue()) {
1638 SDLoc DL(Addr);
1639
1640 const SITargetLowering& Lowering =
1641 *static_cast<const SITargetLowering*>(getTargetLowering());
1642
1643 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1644 return true;
1645 }
1646
1647 return false;
1648}
1649
1650std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1651 SDLoc DL(N);
1652
1653 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1654 SDValue TFI =
1655 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1656
1657 // We rebase the base address into an absolute stack address and hence
1658 // use constant 0 for soffset. This value must be retained until
1659 // frame elimination and eliminateFrameIndex will choose the appropriate
1660 // frame register if need be.
1661 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1662}
1663
1664bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1665 SDValue Addr, SDValue &Rsrc,
1666 SDValue &VAddr, SDValue &SOffset,
1667 SDValue &ImmOffset) const {
1668
1669 SDLoc DL(Addr);
1670 MachineFunction &MF = CurDAG->getMachineFunction();
1671 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1672
1673 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1674
1675 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1676 int64_t Imm = CAddr->getSExtValue();
1677 const int64_t NullPtr =
1678 AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1679 // Don't fold null pointer.
1680 if (Imm != NullPtr) {
1681 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1682 SDValue HighBits =
1683 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1684 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1685 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1686 VAddr = SDValue(MovHighBits, 0);
1687
1688 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1689 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1690 return true;
1691 }
1692 }
1693
1694 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1695 // (add n0, c1)
1696
1697 SDValue N0 = Addr.getOperand(0);
1698 uint64_t C1 = Addr.getConstantOperandVal(1);
1699
1700 // Offsets in vaddr must be positive if range checking is enabled.
1701 //
1702 // The total computation of vaddr + soffset + offset must not overflow. If
1703 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1704 // overflowing.
1705 //
1706 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1707 // always perform a range check. If a negative vaddr base index was used,
1708 // this would fail the range check. The overall address computation would
1709 // compute a valid address, but this doesn't happen due to the range
1710 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1711 //
1712 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1713 // MUBUF vaddr, but not on older subtargets which can only do this if the
1714 // sign bit is known 0.
1715 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1716 if (TII->isLegalMUBUFImmOffset(C1) &&
1717 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1718 CurDAG->SignBitIsZero(N0))) {
1719 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1720 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1721 return true;
1722 }
1723 }
1724
1725 // (node)
1726 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1727 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1728 return true;
1729}
1730
1731static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1732 if (Val.getOpcode() != ISD::CopyFromReg)
1733 return false;
1734 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1735 if (!Reg.isPhysical())
1736 return false;
1737 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1738 return RC && TRI.isSGPRClass(RC);
1739}
1740
1741bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1742 SDValue Addr,
1743 SDValue &SRsrc,
1744 SDValue &SOffset,
1745 SDValue &Offset) const {
1746 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1747 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1748 MachineFunction &MF = CurDAG->getMachineFunction();
1749 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1750 SDLoc DL(Addr);
1751
1752 // CopyFromReg <sgpr>
1753 if (IsCopyFromSGPR(*TRI, Addr)) {
1754 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1755 SOffset = Addr;
1756 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1757 return true;
1758 }
1759
1760 ConstantSDNode *CAddr;
1761 if (Addr.getOpcode() == ISD::ADD) {
1762 // Add (CopyFromReg <sgpr>) <constant>
1763 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1764 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1765 return false;
1766 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1767 return false;
1768
1769 SOffset = Addr.getOperand(0);
1770 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1771 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1772 // <constant>
1773 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1774 } else {
1775 return false;
1776 }
1777
1778 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1779
1780 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1781 return true;
1782}
1783
1784bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1785 SDValue &SOffset, SDValue &Offset
1786 ) const {
1787 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1788 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1789
1790 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1791 return false;
1792
1793 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1794 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1795 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1796 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1797 maskTrailingOnes<uint64_t>(32); // Size
1798 SDLoc DL(Addr);
1799
1800 const SITargetLowering& Lowering =
1801 *static_cast<const SITargetLowering*>(getTargetLowering());
1802
1803 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1804 return true;
1805 }
1806 return false;
1807}
1808
1809bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1810 SDValue &SOffset) const {
1811 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1812 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1813 return true;
1814 }
1815
1816 SOffset = ByteOffsetNode;
1817 return true;
1818}
1819
1820 // Find a load or store from the corresponding pattern root.
1821 // Roots may be build_vector, bitconvert or their combinations.
1822 static MemSDNode *findMemSDNode(SDNode *N) {
1823 N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
1824 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1825 return MN;
1826 assert(isa<BuildVectorSDNode>(N));
1827 for (SDValue V : N->op_values())
1828 if (MemSDNode *MN =
1829 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1830 return MN;
1831 llvm_unreachable("cannot find MemSDNode in the pattern!");
1832 }
1833
1834bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1835 SDValue &VAddr, SDValue &Offset,
1836 uint64_t FlatVariant) const {
1837 int64_t OffsetVal = 0;
1838
1839 unsigned AS = findMemSDNode(N)->getAddressSpace();
1840
1841 bool CanHaveFlatSegmentOffsetBug =
1842 Subtarget->hasFlatSegmentOffsetBug() &&
1843 FlatVariant == SIInstrFlags::FLAT &&
1844 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1845
1846 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1847 SDValue N0, N1;
1848 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1849 (FlatVariant != SIInstrFlags::FlatScratch ||
1850 isFlatScratchBaseLegal(Addr))) {
1851 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1852
1853 // Adding the offset to the base address in a FLAT instruction must not
1854 // change the memory aperture in which the address falls. Therefore we can
1855 // only fold offsets from inbounds GEPs into FLAT instructions.
1856 bool IsInBounds =
1857 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1858 if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1859 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1860 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1861 Addr = N0;
1862 OffsetVal = COffsetVal;
1863 } else {
1864 // If the offset doesn't fit, put the low bits into the offset field
1865 // and add the rest.
1866 //
1867 // For a FLAT instruction the hardware decides whether to access
1868 // global/scratch/shared memory based on the high bits of vaddr,
1869 // ignoring the offset field, so we have to ensure that when we add
1870 // remainder to vaddr it still points into the same underlying object.
1871 // The easiest way to do that is to make sure that we split the offset
1872 // into two pieces that are both >= 0 or both <= 0.
1873
1874 SDLoc DL(N);
1875 uint64_t RemainderOffset;
1876
1877 std::tie(OffsetVal, RemainderOffset) =
1878 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1879
1880 SDValue AddOffsetLo =
1881 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1882 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1883
1884 if (Addr.getValueType().getSizeInBits() == 32) {
1885 SmallVector<SDValue, 3> Opnds;
1886 Opnds.push_back(N0);
1887 Opnds.push_back(AddOffsetLo);
1888 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1889 if (Subtarget->hasAddNoCarry()) {
1890 AddOp = AMDGPU::V_ADD_U32_e64;
1891 Opnds.push_back(Clamp);
1892 }
1893 Addr =
1894 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1895 } else {
1896 // TODO: Should this try to use a scalar add pseudo if the base
1897 // address is uniform and saddr is usable?
1898 SDValue Sub0 =
1899 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1900 SDValue Sub1 =
1901 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1902
1903 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1904 DL, MVT::i32, N0, Sub0);
1905 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1906 DL, MVT::i32, N0, Sub1);
1907
1908 SDValue AddOffsetHi =
1909 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1910
1911 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1912
1913 SDNode *Add =
1914 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1915 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1916
1917 SDNode *Addc = CurDAG->getMachineNode(
1918 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1919 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1920
1921 SDValue RegSequenceArgs[] = {
1922 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1923 MVT::i32),
1924 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1925
1926 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1927 MVT::i64, RegSequenceArgs),
1928 0);
1929 }
1930 }
1931 }
1932 }
1933 }
1934
1935 VAddr = Addr;
1936 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1937 return true;
1938}
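// Illustrative sketch of the offset split above (assumed numbers, not taken
// from the hardware tables): if the legal immediate range for a variant were
// [-2048, 2047] and COffsetVal were 0x12345, splitFlatOffset would yield
// roughly (OffsetVal = 0x345, RemainderOffset = 0x12000). Both pieces are
// >= 0, so adding RemainderOffset to vaddr keeps the address inside the same
// underlying object before the immediate offset is applied.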
1939
1940bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1941 SDValue &VAddr,
1942 SDValue &Offset) const {
1943 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1944}
1945
1946bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1947 SDValue &VAddr,
1948 SDValue &Offset) const {
1949 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1950}
1951
1952bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1953 SDValue &VAddr,
1954 SDValue &Offset) const {
1955 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1956 SIInstrFlags::FlatScratch);
1957 }
1958
1959 // If this matches *_extend i32:x, return x.
1960 // Otherwise, if the value is already i32, return x.
1961 static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1962 const SelectionDAG *DAG) {
1963 if (Op.getValueType() == MVT::i32)
1964 return Op;
1965
1966 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1967 Op.getOpcode() != ISD::ANY_EXTEND &&
1968 !(DAG->SignBitIsZero(Op) &&
1969 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1970 return SDValue();
1971
1972 SDValue ExtSrc = Op.getOperand(0);
1973 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1974}
1975
1976// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1977// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
1978bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
1979 SDValue &SAddr, SDValue &VOffset,
1980 SDValue &Offset, bool &ScaleOffset,
1981 bool NeedIOffset) const {
1982 int64_t ImmOffset = 0;
1983 ScaleOffset = false;
1984
1985 // Match the immediate offset first, which canonically is moved as low as
1986 // possible.
1987
1988 SDValue LHS, RHS;
1989 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1990 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1991 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1992
1993 if (NeedIOffset &&
1994 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1995 SIInstrFlags::FlatGlobal)) {
1996 Addr = LHS;
1997 ImmOffset = COffsetVal;
1998 } else if (!LHS->isDivergent()) {
1999 if (COffsetVal > 0) {
2000 SDLoc SL(N);
2001 // saddr + large_offset -> saddr +
2002 // (voffset = large_offset & ~MaxOffset) +
2003 // (large_offset & MaxOffset);
2004 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2005 if (NeedIOffset) {
2006 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2007 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
2008 }
2009
2010 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2011 : isUInt<32>(RemainderOffset)) {
2012 SDNode *VMov = CurDAG->getMachineNode(
2013 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2014 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2015 VOffset = SDValue(VMov, 0);
2016 SAddr = LHS;
2017 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2018 return true;
2019 }
2020 }
2021
2022 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
2023 // is 1 we would need to perform 1 or 2 extra moves for each half of
2024 // the constant and it is better to do a scalar add and then issue a
2025 // single VALU instruction to materialize zero. Otherwise it takes fewer
2026 // instructions to perform VALU adds with immediates or inline literals.
2027 unsigned NumLiterals =
2028 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2029 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2030 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2031 return false;
2032 }
2033 }
2034
2035 // Match the variable offset.
2036 if (Addr->isAnyAdd()) {
2037 LHS = Addr.getOperand(0);
2038
2039 if (!LHS->isDivergent()) {
2040 // add (i64 sgpr), (*_extend (i32 vgpr))
2041 RHS = Addr.getOperand(1);
2042 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2043 if (SDValue ExtRHS = matchExtFromI32orI32(
2044 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2045 SAddr = LHS;
2046 VOffset = ExtRHS;
2047 }
2048 }
2049
2050 RHS = Addr.getOperand(1);
2051 if (!SAddr && !RHS->isDivergent()) {
2052 // add (*_extend (i32 vgpr)), (i64 sgpr)
2053 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2054 if (SDValue ExtLHS = matchExtFromI32orI32(
2055 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2056 SAddr = RHS;
2057 VOffset = ExtLHS;
2058 }
2059 }
2060
2061 if (SAddr) {
2062 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2063 return true;
2064 }
2065 }
2066
2067 if (Subtarget->hasScaleOffset() &&
2068 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2069 ? AMDGPUISD::MAD_I64_I32
2070 : AMDGPUISD::MAD_U64_U32) ||
2071 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2072 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2073 Addr.getOperand(0)->isDivergent() &&
2074 isa<ConstantSDNode>(Addr.getOperand(1)) &&
2075 !Addr.getOperand(2)->isDivergent()) {
2076 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2077 unsigned Size =
2078 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2079 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2080 if (ScaleOffset) {
2081 SAddr = Addr.getOperand(2);
2082 VOffset = Addr.getOperand(0);
2083 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2084 return true;
2085 }
2086 }
2087
2088 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2089 isa<ConstantSDNode>(Addr))
2090 return false;
2091
2092 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2093 // moves required to copy a 64-bit SGPR to VGPR.
2094 SAddr = Addr;
2095 SDNode *VMov =
2096 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2097 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2098 VOffset = SDValue(VMov, 0);
2099 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2100 return true;
2101}
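// Illustrative sketch (assumed immediate width, not from the source): for a
// uniform base plus a large constant such as saddr + 0x100123 with a 13-bit
// signed immediate field, the splitting above would produce
//   VOffset = V_MOV_B32 0x100000, SAddr = saddr, Offset = 0x123
// instead of materializing the full 64-bit constant addition.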
2102
2103bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2104 SDValue &SAddr, SDValue &VOffset,
2105 SDValue &Offset,
2106 SDValue &CPol) const {
2107 bool ScaleOffset;
2108 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2109 return false;
2110
2111 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2112 SDLoc(), MVT::i32);
2113 return true;
2114}
2115
2116bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2117 SDValue &SAddr, SDValue &VOffset,
2118 SDValue &Offset,
2119 SDValue &CPol) const {
2120 bool ScaleOffset;
2121 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2122 return false;
2123
2124 // We are assuming CPol is always the last operand of the intrinsic.
2125 auto PassedCPol =
2126 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2127 CPol = CurDAG->getTargetConstant(
2128 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2129 return true;
2130}
2131
2132bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2133 SDValue &SAddr,
2134 SDValue &VOffset,
2135 SDValue &Offset,
2136 SDValue &CPol) const {
2137 bool ScaleOffset;
2138 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2139 return false;
2140
2141 // We are assuming CPol is the second-to-last operand of the intrinsic.
2142 auto PassedCPol =
2143 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2144 CPol = CurDAG->getTargetConstant(
2145 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2146 return true;
2147}
2148
2149bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2150 SDValue &SAddr, SDValue &VOffset,
2151 SDValue &Offset,
2152 SDValue &CPol) const {
2153 bool ScaleOffset;
2154 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2155 return false;
2156
2157 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2158 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2159 return true;
2160}
2161
2162bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2163 SDValue &SAddr,
2164 SDValue &VOffset,
2165 SDValue &CPol) const {
2166 bool ScaleOffset;
2167 SDValue DummyOffset;
2168 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2169 false))
2170 return false;
2171
2172 // We are assuming CPol is always the last operand of the intrinsic.
2173 auto PassedCPol =
2174 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2175 CPol = CurDAG->getTargetConstant(
2176 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2177 return true;
2178}
2179
2180bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2181 SDValue &SAddr,
2182 SDValue &VOffset,
2183 SDValue &CPol) const {
2184 bool ScaleOffset;
2185 SDValue DummyOffset;
2186 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2187 false))
2188 return false;
2189
2190 // We are assuming CPol is the second-to-last operand of the intrinsic.
2191 auto PassedCPol =
2192 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2193 CPol = CurDAG->getTargetConstant(
2194 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2195 return true;
2196}
2197
2198 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2199 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2200 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2201 } else if (SAddr.getOpcode() == ISD::ADD &&
2202 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
2203 // Materialize this into a scalar move for scalar address to avoid
2204 // readfirstlane.
2205 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2206 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2207 FI->getValueType(0));
2208 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2209 MVT::i32, TFI, SAddr.getOperand(1)),
2210 0);
2211 }
2212
2213 return SAddr;
2214}
2215
2216// Match (32-bit SGPR base) + sext(imm offset)
2217bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2218 SDValue &SAddr,
2219 SDValue &Offset) const {
2220 if (Addr->isDivergent())
2221 return false;
2222
2223 SDLoc DL(Addr);
2224
2225 int64_t COffsetVal = 0;
2226
2227 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2228 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2229 SAddr = Addr.getOperand(0);
2230 } else {
2231 SAddr = Addr;
2232 }
2233
2234 SAddr = SelectSAddrFI(CurDAG, SAddr);
2235
2236 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2237
2238 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2239 SIInstrFlags::FlatScratch)) {
2240 int64_t SplitImmOffset, RemainderOffset;
2241 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2242 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2243
2244 COffsetVal = SplitImmOffset;
2245
2246 SDValue AddOffset =
2247 SAddr.getOpcode() == ISD::TargetFrameIndex
2248 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2249 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2250 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2251 SAddr, AddOffset),
2252 0);
2253 }
2254
2255 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2256
2257 return true;
2258}
2259
2260// Check whether the flat scratch SVS swizzle bug affects this access.
2261bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2262 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2263 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2264 return false;
2265
2266 // The bug affects the swizzling of SVS accesses if there is any carry out
2267 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2268 // voffset to (soffset + inst_offset).
2269 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2270 KnownBits SKnown =
2271 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2272 KnownBits::makeConstant(APInt(32, ImmOffset,
2273 /*isSigned=*/true)));
2274 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2275 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2276 return (VMax & 3) + (SMax & 3) >= 4;
2277}
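// Worked example of the carry check above (illustrative): if the known-bits
// maxima give (VMax & 3) == 2 and (SMax & 3) == 3, then 2 + 3 = 5 >= 4, so a
// carry from bit 1 into bit 2 is possible and the SVS form must be rejected;
// with (VMax & 3) == 0 the sum is at most 3 and the access is known safe.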
2278
2279bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2280 SDValue &VAddr, SDValue &SAddr,
2281 SDValue &Offset,
2282 SDValue &CPol) const {
2283 int64_t ImmOffset = 0;
2284
2285 SDValue LHS, RHS;
2286 SDValue OrigAddr = Addr;
2287 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2288 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2289 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2290
2291 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2292 SIInstrFlags::FlatScratch)) {
2293 Addr = LHS;
2294 ImmOffset = COffsetVal;
2295 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2296 SDLoc SL(N);
2297 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2298 // (large_offset & MaxOffset);
2299 int64_t SplitImmOffset, RemainderOffset;
2300 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2301 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2302
2303 if (isUInt<32>(RemainderOffset)) {
2304 SDNode *VMov = CurDAG->getMachineNode(
2305 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2306 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2307 VAddr = SDValue(VMov, 0);
2308 SAddr = LHS;
2309 if (!isFlatScratchBaseLegal(Addr))
2310 return false;
2311 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2312 return false;
2313 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2314 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2315 return true;
2316 }
2317 }
2318 }
2319
2320 if (Addr.getOpcode() != ISD::ADD)
2321 return false;
2322
2323 LHS = Addr.getOperand(0);
2324 RHS = Addr.getOperand(1);
2325
2326 if (!LHS->isDivergent() && RHS->isDivergent()) {
2327 SAddr = LHS;
2328 VAddr = RHS;
2329 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2330 SAddr = RHS;
2331 VAddr = LHS;
2332 } else {
2333 return false;
2334 }
2335
2336 if (OrigAddr != Addr) {
2337 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2338 return false;
2339 } else {
2340 if (!isFlatScratchBaseLegalSV(OrigAddr))
2341 return false;
2342 }
2343
2344 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2345 return false;
2346 SAddr = SelectSAddrFI(CurDAG, SAddr);
2347 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2348
2349 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2350 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2351 SDLoc(), MVT::i32);
2352 return true;
2353}
2354
2355 // For unbuffered smem loads, it is illegal for the Immediate Offset to be
2356 // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2357// Handle the case where the Immediate Offset + SOffset is negative.
2358bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2359 bool Imm32Only,
2360 bool IsBuffer,
2361 int64_t ImmOffset) const {
2362 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2363 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2364 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2365 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2366 return false;
2367 }
2368
2369 return true;
2370}
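// Illustrative example (not from the source): with ImmOffset = -8 and a known
// SOffset minimum of 4, the sum -4 is negative, so the combination is
// rejected for unbuffered loads on subtargets with signed SMRD offsets.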
2371
2372 // Given \p Offset and load node \p N, check if \p Offset is a multiple of
2373 // the load byte size. If it is, update \p Offset to a pre-scaled value and
2374 // return true.
2375bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2376 bool IsSigned) const {
2377 bool ScaleOffset = false;
2378 if (!Subtarget->hasScaleOffset() || !Offset)
2379 return false;
2380
2381 unsigned Size =
2382 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2383
2384 SDValue Off = Offset;
2385 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2386 Off = Ext;
2387
2388 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2389 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2390 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2391 } else if (Offset.getOpcode() == ISD::MUL ||
2392 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2393 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2394 (Offset.isMachineOpcode() &&
2395 Offset.getMachineOpcode() ==
2396 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2397 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2398 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2399 ScaleOffset = C->getZExtValue() == Size;
2400 }
2401
2402 if (ScaleOffset)
2403 Offset = Off.getOperand(0);
2404
2405 return ScaleOffset;
2406}
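// Illustrative example (not from the source): for a 4-byte load, an offset of
// (shl x, 2) or (mul x, 4) is recognized here, Offset is rewritten to x, and
// the SCAL cache-policy bit set by the callers re-introduces the *4 scaling
// in the instruction itself.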
2407
2408// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2409// not null) offset. If Imm32Only is true, match only 32-bit immediate
2410// offsets available on CI.
2411bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2412 SDValue *SOffset, SDValue *Offset,
2413 bool Imm32Only, bool IsBuffer,
2414 bool HasSOffset, int64_t ImmOffset,
2415 bool *ScaleOffset) const {
2416 assert((!SOffset || !Offset) &&
2417 "Cannot match both soffset and offset at the same time!");
2418
2419 if (ScaleOffset) {
2420 assert(N && SOffset);
2421
2422 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2423 }
2424
2425 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2426 if (!C) {
2427 if (!SOffset)
2428 return false;
2429
2430 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2431 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2432 *SOffset = ByteOffsetNode;
2433 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2434 ImmOffset);
2435 }
2436 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2437 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2438 *SOffset = ByteOffsetNode.getOperand(0);
2439 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2440 ImmOffset);
2441 }
2442 }
2443 return false;
2444 }
2445
2446 SDLoc SL(ByteOffsetNode);
2447
2448 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2449 // offset for S_BUFFER instructions is unsigned.
2450 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2451 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2452 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2453 if (EncodedOffset && Offset && !Imm32Only) {
2454 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2455 return true;
2456 }
2457
2458 // SGPR and literal offsets are unsigned.
2459 if (ByteOffset < 0)
2460 return false;
2461
2462 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2463 if (EncodedOffset && Offset && Imm32Only) {
2464 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2465 return true;
2466 }
2467
2468 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2469 return false;
2470
2471 if (SOffset) {
2472 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2473 *SOffset = SDValue(
2474 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2475 return true;
2476 }
2477
2478 return false;
2479}
2480
2481SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2482 if (Addr.getValueType() != MVT::i32)
2483 return Addr;
2484
2485 // Zero-extend a 32-bit address.
2486 SDLoc SL(Addr);
2487
2488 const MachineFunction &MF = CurDAG->getMachineFunction();
2489 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2490 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2491 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2492
2493 const SDValue Ops[] = {
2494 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2495 Addr,
2496 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2497 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2498 0),
2499 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2500 };
2501
2502 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2503 Ops), 0);
2504}
2505
2506// Match a base and an immediate (if Offset is not null) or an SGPR (if
2507// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2508// true, match only 32-bit immediate offsets available on CI.
2509bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2510 SDValue &SBase, SDValue *SOffset,
2511 SDValue *Offset, bool Imm32Only,
2512 bool IsBuffer, bool HasSOffset,
2513 int64_t ImmOffset,
2514 bool *ScaleOffset) const {
2515 if (SOffset && Offset) {
2516 assert(!Imm32Only && !IsBuffer);
2517 SDValue B;
2518
2519 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2520 return false;
2521
2522 int64_t ImmOff = 0;
2523 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2524 ImmOff = C->getSExtValue();
2525
2526 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2527 true, ImmOff, ScaleOffset);
2528 }
2529
2530 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2531 // wraparound, because s_load instructions perform the addition in 64 bits.
2532 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2533 !Addr->getFlags().hasNoUnsignedWrap())
2534 return false;
2535
2536 SDValue N0, N1;
2537 // Extract the base and offset if possible.
2538 if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
2539 N0 = Addr.getOperand(0);
2540 N1 = Addr.getOperand(1);
2541 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2542 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2543 }
2544 if (!N0 || !N1)
2545 return false;
2546
2547 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2548 ImmOffset, ScaleOffset)) {
2549 SBase = N0;
2550 return true;
2551 }
2552 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2553 ImmOffset, ScaleOffset)) {
2554 SBase = N1;
2555 return true;
2556 }
2557 return false;
2558}
2559
2560bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2561 SDValue *SOffset, SDValue *Offset,
2562 bool Imm32Only, bool *ScaleOffset) const {
2563 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2564 /* IsBuffer */ false, /* HasSOffset */ false,
2565 /* ImmOffset */ 0, ScaleOffset)) {
2566 SBase = Expand32BitAddress(SBase);
2567 return true;
2568 }
2569
2570 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2571 SBase = Expand32BitAddress(Addr);
2572 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2573 return true;
2574 }
2575
2576 return false;
2577}
2578
2579bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2580 SDValue &Offset) const {
2581 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2582 &Offset);
2583}
2584
2585bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2586 SDValue &Offset) const {
2587 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2588 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2589 &Offset, /* Imm32Only */ true);
2590}
2591
2592bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2593 SDValue &SOffset, SDValue &CPol) const {
2594 bool ScaleOffset;
2595 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2596 /* Imm32Only */ false, &ScaleOffset))
2597 return false;
2598
2599 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2600 SDLoc(N), MVT::i32);
2601 return true;
2602}
2603
2604bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2605 SDValue &SBase, SDValue &SOffset,
2606 SDValue &Offset,
2607 SDValue &CPol) const {
2608 bool ScaleOffset;
2609 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2610 return false;
2611
2612 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2613 SDLoc(N), MVT::i32);
2614 return true;
2615}
2616
2617bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2618 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2619 /* Imm32Only */ false, /* IsBuffer */ true);
2620}
2621
2622bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2623 SDValue &Offset) const {
2624 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2625 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2626 /* Imm32Only */ true, /* IsBuffer */ true);
2627}
2628
2629bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2630 SDValue &Offset) const {
2631 // Match the (soffset + offset) pair as a 32-bit register base and
2632 // an immediate offset.
2633 return N.getValueType() == MVT::i32 &&
2634 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2635 /* SOffset*/ nullptr, &Offset,
2636 /* Imm32Only */ false, /* IsBuffer */ true);
2637}
2638
2639bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2640 SDValue &Base,
2641 SDValue &Offset) const {
2642 SDLoc DL(Index);
2643
2644 if (CurDAG->isBaseWithConstantOffset(Index)) {
2645 SDValue N0 = Index.getOperand(0);
2646 SDValue N1 = Index.getOperand(1);
2647 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2648
2649 // (add n0, c0)
2650 // Don't peel off the offset (c0) if doing so could possibly lead
2651 // the base (n0) to be negative.
2652 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2653 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2654 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2655 Base = N0;
2656 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2657 return true;
2658 }
2659 }
2660
2661 if (isa<ConstantSDNode>(Index))
2662 return false;
2663
2664 Base = Index;
2665 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2666 return true;
2667}
2668
2669SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2670 SDValue Val, uint32_t Offset,
2671 uint32_t Width) {
2672 if (Val->isDivergent()) {
2673 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2674 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2675 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2676
2677 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2678 }
2679 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2680 // Transformation function, pack the offset and width of a BFE into
2681 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2682 // source, bits [5:0] contain the offset and bits [22:16] the width.
2683 uint32_t PackedVal = Offset | (Width << 16);
2684 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2685
2686 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2687}
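// Worked example of the packed S_BFE operand described above: Offset = 8 and
// Width = 5 pack to 8 | (5 << 16) = 0x00050008, i.e. bits [5:0] = 8 and
// bits [22:16] = 5.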
2688
2689void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2690 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2691 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2692 // Predicate: 0 < b <= c < 32
2693
2694 const SDValue &Shl = N->getOperand(0);
2695 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2696 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2697
2698 if (B && C) {
2699 uint32_t BVal = B->getZExtValue();
2700 uint32_t CVal = C->getZExtValue();
2701
2702 if (0 < BVal && BVal <= CVal && CVal < 32) {
2703 bool Signed = N->getOpcode() == ISD::SRA;
2704 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2705 32 - CVal));
2706 return;
2707 }
2708 }
2709 SelectCode(N);
2710}
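// Illustrative instance of the shift pattern above: "(x << 8) srl 24" has
// b = 8 and c = 24, so it becomes BFE_U32 x, (24 - 8) = 16, (32 - 24) = 8,
// i.e. extract 8 bits starting at bit 16.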
2711
2712void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2713 switch (N->getOpcode()) {
2714 case ISD::AND:
2715 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2716 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2717 // Predicate: isMask(mask)
2718 const SDValue &Srl = N->getOperand(0);
2719 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2720 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2721
2722 if (Shift && Mask) {
2723 uint32_t ShiftVal = Shift->getZExtValue();
2724 uint32_t MaskVal = Mask->getZExtValue();
2725
2726 if (isMask_32(MaskVal)) {
2727 uint32_t WidthVal = llvm::popcount(MaskVal);
2728 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2729 WidthVal));
2730 return;
2731 }
2732 }
2733 }
2734 break;
2735 case ISD::SRL:
2736 if (N->getOperand(0).getOpcode() == ISD::AND) {
2737 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2738 // Predicate: isMask(mask >> b)
2739 const SDValue &And = N->getOperand(0);
2740 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2741 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2742
2743 if (Shift && Mask) {
2744 uint32_t ShiftVal = Shift->getZExtValue();
2745 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2746
2747 if (isMask_32(MaskVal)) {
2748 uint32_t WidthVal = llvm::popcount(MaskVal);
2749 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2750 WidthVal));
2751 return;
2752 }
2753 }
2754 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2755 SelectS_BFEFromShifts(N);
2756 return;
2757 }
2758 break;
2759 case ISD::SRA:
2760 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2761 SelectS_BFEFromShifts(N);
2762 return;
2763 }
2764 break;
2765
2766 case ISD::SIGN_EXTEND_INREG: {
2767 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2768 SDValue Src = N->getOperand(0);
2769 if (Src.getOpcode() != ISD::SRL)
2770 break;
2771
2772 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2773 if (!Amt)
2774 break;
2775
2776 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2777 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2778 Amt->getZExtValue(), Width));
2779 return;
2780 }
2781 }
2782
2783 SelectCode(N);
2784}
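// Illustrative instances of the AND/SRL cases above (not from the source):
//   (x srl 3) & 0xff    -> BFE_U32 x, 3, popcount(0xff) = 8
//   (x & 0xff00) srl 8  -> BFE_U32 x, 8, popcount(0xff00 >> 8) = 8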
2785
2786bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2787 assert(N->getOpcode() == ISD::BRCOND);
2788 if (!N->hasOneUse())
2789 return false;
2790
2791 SDValue Cond = N->getOperand(1);
2792 if (Cond.getOpcode() == ISD::CopyToReg)
2793 Cond = Cond.getOperand(2);
2794
2795 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2796 return false;
2797
2798 MVT VT = Cond.getOperand(0).getSimpleValueType();
2799 if (VT == MVT::i32)
2800 return true;
2801
2802 if (VT == MVT::i64) {
2803 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2804 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2805 Subtarget->hasScalarCompareEq64();
2806 }
2807
2808 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2809 return true;
2810
2811 return false;
2812}
2813
2814static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2815 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2816 // Special case for amdgcn.ballot:
2817 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2818 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2819 // =>
2820 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2821 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2822 // Cond becomes a i(WaveSize) full mask value.
2823 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2824 // here for completeness, so in this case Negate is set true on return.
2825 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2826 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2827 isNullConstant(VCMP.getOperand(1))) {
2828
2829 auto Cond = VCMP.getOperand(0);
2830 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2831 Cond = Cond.getOperand(0);
2832
2833 if (isBoolSGPR(Cond)) {
2834 Negate = VCMP_CC == ISD::SETEQ;
2835 return Cond;
2836 }
2837 }
2838 return SDValue();
2839}
2840
2841void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2842 SDValue Cond = N->getOperand(1);
2843
2844 if (Cond.isUndef()) {
2845 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2846 N->getOperand(2), N->getOperand(0));
2847 return;
2848 }
2849
2850 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2851
2852 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2853 bool AndExec = !UseSCCBr;
2854 bool Negate = false;
2855
2856 if (Cond.getOpcode() == ISD::SETCC &&
2857 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2858 SDValue VCMP = Cond->getOperand(0);
2859 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2860 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2861 isNullConstant(Cond->getOperand(1)) &&
2862 // We may encounter ballot.i64 in wave32 mode on -O0.
2863 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2864 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2865 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2866 // BRCOND i1 %C, %BB
2867 // =>
2868 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2869 // VCC = COPY i(WaveSize) %VCMP
2870 // S_CBRANCH_VCCNZ/VCCZ %BB
2871 Negate = CC == ISD::SETEQ;
2872 bool NegatedBallot = false;
2873 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2874 Cond = BallotCond;
2875 UseSCCBr = !BallotCond->isDivergent();
2876 Negate = Negate ^ NegatedBallot;
2877 } else {
2878 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2879 // selected as V_CMP, but this may change for uniform condition.
2880 Cond = VCMP;
2881 UseSCCBr = false;
2882 }
2883 }
2884 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2885 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2886 // used.
2887 AndExec = false;
2888 }
2889
2890 unsigned BrOp =
2891 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2892 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2893 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2894 SDLoc SL(N);
2895
2896 if (AndExec) {
2897 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2898 // analyzed what generates the vcc value, so we do not know whether vcc
2899 // bits for disabled lanes are 0. Thus we need to mask out bits for
2900 // disabled lanes.
2901 //
2902 // For the case that we select S_CBRANCH_SCC1 and it gets
2903 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2904 // SIInstrInfo::moveToVALU which inserts the S_AND.
2905 //
2906 // We could add an analysis of what generates the vcc value here and omit
2907 // the S_AND when it is unnecessary. But it would be better to add a separate
2908 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2909 // catches both cases.
2910 Cond = SDValue(
2911 CurDAG->getMachineNode(
2912 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2913 MVT::i1,
2914 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2915 : AMDGPU::EXEC,
2916 MVT::i1),
2917 Cond),
2918 0);
2919 }
2920
2921 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2922 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2923 N->getOperand(2), // Basic Block
2924 VCC.getValue(0));
2925}
2926
2927void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2928 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2929 !N->isDivergent()) {
2930 SDValue Src = N->getOperand(0);
2931 if (Src.getValueType() == MVT::f16) {
2932 if (isExtractHiElt(Src, Src)) {
2933 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2934 {Src});
2935 return;
2936 }
2937 }
2938 }
2939
2940 SelectCode(N);
2941}
2942
2943void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2944 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2945 // be copied to an SGPR with readfirstlane.
2946 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2947 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2948
2949 SDValue Chain = N->getOperand(0);
2950 SDValue Ptr = N->getOperand(2);
2951 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2952 MachineMemOperand *MMO = M->getMemOperand();
2953 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2954
2955 SDValue Offset;
2956 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2957 SDValue PtrBase = Ptr.getOperand(0);
2958 SDValue PtrOffset = Ptr.getOperand(1);
2959
2960 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2961 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2962 N = glueCopyToM0(N, PtrBase);
2963 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2964 }
2965 }
2966
2967 if (!Offset) {
2968 N = glueCopyToM0(N, Ptr);
2969 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2970 }
2971
2972 SDValue Ops[] = {
2973 Offset,
2974 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2975 Chain,
2976 N->getOperand(N->getNumOperands() - 1) // New glue
2977 };
2978
2979 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2980 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2981}
2982
2983// We need to handle this here because tablegen doesn't support matching
2984// instructions with multiple outputs.
2985void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2986 unsigned Opc;
2987 switch (IntrID) {
2988 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2989 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2990 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2991 break;
2992 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2993 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2994 break;
2995 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2996 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2997 break;
2998 }
2999 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3000 N->getOperand(5), N->getOperand(0)};
3001
3002 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3003 MachineMemOperand *MMO = M->getMemOperand();
3004 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3005 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3006}
3007
3008static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3009 switch (IntrID) {
3010 case Intrinsic::amdgcn_ds_gws_init:
3011 return AMDGPU::DS_GWS_INIT;
3012 case Intrinsic::amdgcn_ds_gws_barrier:
3013 return AMDGPU::DS_GWS_BARRIER;
3014 case Intrinsic::amdgcn_ds_gws_sema_v:
3015 return AMDGPU::DS_GWS_SEMA_V;
3016 case Intrinsic::amdgcn_ds_gws_sema_br:
3017 return AMDGPU::DS_GWS_SEMA_BR;
3018 case Intrinsic::amdgcn_ds_gws_sema_p:
3019 return AMDGPU::DS_GWS_SEMA_P;
3020 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3021 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3022 default:
3023 llvm_unreachable("not a gws intrinsic");
3024 }
3025}
3026
3027void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3028 if (!Subtarget->hasGWS() ||
3029 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3030 !Subtarget->hasGWSSemaReleaseAll())) {
3031 // Let this error.
3032 SelectCode(N);
3033 return;
3034 }
3035
3036 // Chain, intrinsic ID, vsrc, offset
3037 const bool HasVSrc = N->getNumOperands() == 4;
3038 assert(HasVSrc || N->getNumOperands() == 3);
3039
3040 SDLoc SL(N);
3041 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3042 int ImmOffset = 0;
3043 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3044 MachineMemOperand *MMO = M->getMemOperand();
3045
3046 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
3047 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3048
3049 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3050 // offset field) % 64. Some versions of the programming guide omit the m0
3051 // part, or claim it's from offset 0.
3052 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3053 // If we have a constant offset, try to use the 0 in m0 as the base.
3054 // TODO: Look into changing the default m0 initialization value. If the
3055 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3056 // the immediate offset.
3057 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3058 ImmOffset = ConstOffset->getZExtValue();
3059 } else {
3060 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3061 ImmOffset = BaseOffset.getConstantOperandVal(1);
3062 BaseOffset = BaseOffset.getOperand(0);
3063 }
3064
3065 // Prefer to do the shift in an SGPR since it should be possible to use m0
3066 // as the result directly. If it's already an SGPR, it will be eliminated
3067 // later.
3068 SDNode *SGPROffset
3069 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3070 BaseOffset);
3071 // Shift to offset in m0
3072 SDNode *M0Base
3073 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3074 SDValue(SGPROffset, 0),
3075 CurDAG->getTargetConstant(16, SL, MVT::i32));
3076 glueCopyToM0(N, SDValue(M0Base, 0));
3077 }
3078
3079 SDValue Chain = N->getOperand(0);
3080 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3081
3082 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3083
3084 const MCInstrDesc &InstrDesc = TII->get(Opc);
3085 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3086
3087 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3088
3089 SmallVector<SDValue, 5> Ops;
3090 if (HasVSrc) {
3091 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3092
3093 SDValue Data = N->getOperand(2);
3094 MVT DataVT = Data.getValueType().getSimpleVT();
3095 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3096 // Normal 32-bit case.
3097 Ops.push_back(N->getOperand(2));
3098 } else {
3099 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3100 // even aligned 64-bit register class.
3101 const SDValue RegSeqOps[] = {
3102 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3103 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3104 SDValue(
3105 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3106 0),
3107 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3108
3109 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3110 SL, MVT::v2i32, RegSeqOps),
3111 0));
3112 }
3113 }
3114
3115 Ops.push_back(OffsetField);
3116 Ops.push_back(Chain);
3117
3118 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3119 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3120}
3121
3122void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3123 if (Subtarget->getLDSBankCount() != 16) {
3124 // This is a single instruction with a pattern.
3125 SelectCode(N);
3126 return;
3127 }
3128
3129 SDLoc DL(N);
3130
3131 // This requires 2 instructions. It is possible to write a pattern to support
3132 // this, but the generated isel emitter doesn't correctly deal with multiple
3133 // output instructions using the same physical register input. The copy to m0
3134 // is incorrectly placed before the second instruction.
3135 //
3136 // TODO: Match source modifiers.
3137 //
3138 // def : Pat <
3139 // (int_amdgcn_interp_p1_f16
3140 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3141 // (i32 timm:$attrchan), (i32 timm:$attr),
3142 // (i1 timm:$high), M0),
3143 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3144 // timm:$attrchan, 0,
3145 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3146 // let Predicates = [has16BankLDS];
3147 // }
3148
3149 // 16 bank LDS
3150 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3151 N->getOperand(5), SDValue());
3152
3153 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3154
3155 SDNode *InterpMov =
3156 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3157 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3158 N->getOperand(3), // Attr
3159 N->getOperand(2), // Attrchan
3160 ToM0.getValue(1) // In glue
3161 });
3162
3163 SDNode *InterpP1LV =
3164 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3165 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3166 N->getOperand(1), // Src0
3167 N->getOperand(3), // Attr
3168 N->getOperand(2), // Attrchan
3169 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3170 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3171 N->getOperand(4), // high
3172 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3173 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3174 SDValue(InterpMov, 1)
3175 });
3176
3177 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3178}
3179
3180void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3181 unsigned IntrID = N->getConstantOperandVal(1);
3182 switch (IntrID) {
3183 case Intrinsic::amdgcn_ds_append:
3184 case Intrinsic::amdgcn_ds_consume: {
3185 if (N->getValueType(0) != MVT::i32)
3186 break;
3187 SelectDSAppendConsume(N, IntrID);
3188 return;
3189 }
3190 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3191 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3192 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3193 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3194 SelectDSBvhStackIntrinsic(N, IntrID);
3195 return;
3196 case Intrinsic::amdgcn_init_whole_wave:
3197 CurDAG->getMachineFunction()
3198 .getInfo<SIMachineFunctionInfo>()
3199 ->setInitWholeWave();
3200 break;
3201 }
3202
3203 SelectCode(N);
3204}
3205
3206void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3207 unsigned IntrID = N->getConstantOperandVal(0);
3208 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3209 SDNode *ConvGlueNode = N->getGluedNode();
3210 if (ConvGlueNode) {
3211 // FIXME: Possibly iterate over multiple glue nodes?
3212 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3213 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3214 ConvGlueNode =
3215 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3216 MVT::Glue, SDValue(ConvGlueNode, 0));
3217 } else {
3218 ConvGlueNode = nullptr;
3219 }
3220 switch (IntrID) {
3221 case Intrinsic::amdgcn_wqm:
3222 Opcode = AMDGPU::WQM;
3223 break;
3224 case Intrinsic::amdgcn_softwqm:
3225 Opcode = AMDGPU::SOFT_WQM;
3226 break;
3227 case Intrinsic::amdgcn_wwm:
3228 case Intrinsic::amdgcn_strict_wwm:
3229 Opcode = AMDGPU::STRICT_WWM;
3230 break;
3231 case Intrinsic::amdgcn_strict_wqm:
3232 Opcode = AMDGPU::STRICT_WQM;
3233 break;
3234 case Intrinsic::amdgcn_interp_p1_f16:
3235 SelectInterpP1F16(N);
3236 return;
3237 case Intrinsic::amdgcn_permlane16_swap:
3238 case Intrinsic::amdgcn_permlane32_swap: {
3239 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3240 !Subtarget->hasPermlane16Swap()) ||
3241 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3242 !Subtarget->hasPermlane32Swap())) {
3243 SelectCode(N); // Hit the default error
3244 return;
3245 }
3246
3247 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3248 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3249 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3250
3251 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3252 if (ConvGlueNode)
3253 NewOps.push_back(SDValue(ConvGlueNode, 0));
3254
3255 bool FI = N->getConstantOperandVal(3);
3256 NewOps[2] = CurDAG->getTargetConstant(
3257 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3258
3259 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3260 return;
3261 }
3262 default:
3263 SelectCode(N);
3264 break;
3265 }
3266
3267 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3268 SDValue Src = N->getOperand(1);
3269 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3270 }
3271
3272 if (ConvGlueNode) {
3273 SmallVector<SDValue, 4> NewOps(N->ops());
3274 NewOps.push_back(SDValue(ConvGlueNode, 0));
3275 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3276 }
3277}
3278
3279void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3280 unsigned IntrID = N->getConstantOperandVal(1);
3281 switch (IntrID) {
3282 case Intrinsic::amdgcn_ds_gws_init:
3283 case Intrinsic::amdgcn_ds_gws_barrier:
3284 case Intrinsic::amdgcn_ds_gws_sema_v:
3285 case Intrinsic::amdgcn_ds_gws_sema_br:
3286 case Intrinsic::amdgcn_ds_gws_sema_p:
3287 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3288 SelectDS_GWS(N, IntrID);
3289 return;
3290 default:
3291 break;
3292 }
3293
3294 SelectCode(N);
3295}
3296
3297void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3298 SDValue Log2WaveSize =
3299 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3300 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3301 {N->getOperand(0), Log2WaveSize});
3302}
3303
3304void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3305 SDValue SrcVal = N->getOperand(1);
3306 if (SrcVal.getValueType() != MVT::i32) {
3307 SelectCode(N); // Emit default error
3308 return;
3309 }
3310
3311 SDValue CopyVal;
3312 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3313 SDLoc SL(N);
3314
3315 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3316 CopyVal = SrcVal.getOperand(0);
3317 } else {
3318 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3319 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3320
3321 if (N->isDivergent()) {
3322 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3323 MVT::i32, SrcVal),
3324 0);
3325 }
3326
3327 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3328 {SrcVal, Log2WaveSize}),
3329 0);
3330 }
3331
3332 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3333 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3334}
3335
3336bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3337 unsigned &Mods,
3338 bool IsCanonicalizing,
3339 bool AllowAbs) const {
3340 Mods = SISrcMods::NONE;
3341 Src = In;
3342
3343 if (Src.getOpcode() == ISD::FNEG) {
3344 Mods |= SISrcMods::NEG;
3345 Src = Src.getOperand(0);
3346 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3347 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3348 // denormal mode, but we're implicitly canonicalizing in a source operand.
3349 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3350 if (LHS && LHS->isZero()) {
3351 Mods |= SISrcMods::NEG;
3352 Src = Src.getOperand(1);
3353 }
3354 }
3355
3356 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3357 Mods |= SISrcMods::ABS;
3358 Src = Src.getOperand(0);
3359 }
3360
3361 if (Mods != SISrcMods::NONE)
3362 return true;
3363
3364 // Convert various sign-bit masks on integers to src mods. Currently disabled
3365 // for 16-bit types as the codegen replaces the operand without adding a
3366 // srcmod. This is intentionally finding the cases where we are performing
3367 // float neg and abs on int types; the goal is not to obtain two's complement
3368 // neg or abs. Limit conversion to select operands via the non-canonicalizing
3369 // pattern.
3370 // TODO: Add 16-bit support.
3371 if (IsCanonicalizing)
3372 return true;
3373
3374 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3375 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3376 // through the extract to the bitwise op.
3377 SDValue PeekSrc =
3378 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3379 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3380 // types as the codegen replaces the operand without adding a srcmod.
3381 // This is intentionally finding the cases where we are performing float neg
3382 // and abs on int types; the goal is not to obtain two's complement neg or
3383 // abs.
3384 // TODO: Add 16-bit support.
3385 unsigned Opc = PeekSrc.getOpcode();
3386 EVT VT = Src.getValueType();
3387 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3388 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3389 return true;
3390
3391 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3392 if (!CRHS)
3393 return true;
3394
3395 auto ReplaceSrc = [&]() -> SDValue {
3396 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3397 return Src.getOperand(0);
3398
3399 SDValue LHS = PeekSrc->getOperand(0);
3400 SDValue Index = Src->getOperand(1);
3401 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3402 Src.getValueType(), LHS, Index);
3403 };
3404
3405 // Recognise Srcmods:
3406 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3407 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3408 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3409 // SrcModifiers.
3410 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3411 Mods |= SISrcMods::NEG;
3412 Src = ReplaceSrc();
3413 } else if (Opc == ISD::AND && AllowAbs &&
3414 CRHS->getAPIntValue().isMaxSignedValue()) {
3415 Mods |= SISrcMods::ABS;
3416 Src = ReplaceSrc();
3417 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3418 Mods |= SISrcMods::NEG | SISrcMods::ABS;
3419 Src = ReplaceSrc();
3420 }
3421
3422 return true;
3423}
3424
3425bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3426 SDValue &SrcMods) const {
3427 unsigned Mods;
3428 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3429 /*AllowAbs=*/true)) {
3430 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3431 return true;
3432 }
3433
3434 return false;
3435}
3436
3437bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3438 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3439 unsigned Mods;
3440 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3441 /*AllowAbs=*/true)) {
3442 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3443 return true;
3444 }
3445
3446 return false;
3447}
3448
3449bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3450 SDValue &SrcMods) const {
3451 unsigned Mods;
3452 if (SelectVOP3ModsImpl(In, Src, Mods,
3453 /*IsCanonicalizing=*/true,
3454 /*AllowAbs=*/false)) {
3455 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3456 return true;
3457 }
3458
3459 return false;
3460}
3461
3462bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3463 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3464 return false;
3465
3466 Src = In;
3467 return true;
3468}
3469
3470bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3471 SDValue &SrcMods,
3472 bool OpSel) const {
3473 unsigned Mods;
3474 if (SelectVOP3ModsImpl(In, Src, Mods,
3475 /*IsCanonicalizing=*/true,
3476 /*AllowAbs=*/false)) {
3477 if (OpSel)
3478 Mods |= SISrcMods::OP_SEL_0;
3479 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3480 return true;
3481 }
3482
3483 return false;
3484}
3485
3486bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3487 SDValue &SrcMods) const {
3488 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3489}
3490
3491bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3492 SDValue &SrcMods) const {
3493 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3494}
3495
3496bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3497 SDValue &SrcMods, SDValue &Clamp,
3498 SDValue &Omod) const {
3499 SDLoc DL(In);
3500 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3501 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3502
3503 return SelectVOP3Mods(In, Src, SrcMods);
3504}
3505
3506bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3507 SDValue &SrcMods, SDValue &Clamp,
3508 SDValue &Omod) const {
3509 SDLoc DL(In);
3510 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3511 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3512
3513 return SelectVOP3BMods(In, Src, SrcMods);
3514}
3515
3516bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3517 SDValue &Clamp, SDValue &Omod) const {
3518 Src = In;
3519
3520 SDLoc DL(In);
3521 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3522 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3523
3524 return true;
3525}
3526
3527bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3528 SDValue &SrcMods, bool IsDOT) const {
3529 unsigned Mods = SISrcMods::NONE;
3530 Src = In;
3531
3532 // TODO: Handle G_FSUB 0 as fneg
3533 if (Src.getOpcode() == ISD::FNEG) {
3534 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3535 Src = Src.getOperand(0);
3536 }
3537
3538 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3539 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3540 unsigned VecMods = Mods;
3541
3542 SDValue Lo = stripBitcast(Src.getOperand(0));
3543 SDValue Hi = stripBitcast(Src.getOperand(1));
3544
3545 if (Lo.getOpcode() == ISD::FNEG) {
3546 Lo = stripBitcast(Lo.getOperand(0));
3547 Mods ^= SISrcMods::NEG;
3548 }
3549
3550 if (Hi.getOpcode() == ISD::FNEG) {
3551 Hi = stripBitcast(Hi.getOperand(0));
3552 Mods ^= SISrcMods::NEG_HI;
3553 }
3554
3555 if (isExtractHiElt(Lo, Lo))
3556 Mods |= SISrcMods::OP_SEL_0;
3557
3558 if (isExtractHiElt(Hi, Hi))
3559 Mods |= SISrcMods::OP_SEL_1;
3560
3561 unsigned VecSize = Src.getValueSizeInBits();
3562 Lo = stripExtractLoElt(Lo);
3563 Hi = stripExtractLoElt(Hi);
3564
3565 if (Lo.getValueSizeInBits() > VecSize) {
3566 Lo = CurDAG->getTargetExtractSubreg(
3567 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3568 MVT::getIntegerVT(VecSize), Lo);
3569 }
3570
3571 if (Hi.getValueSizeInBits() > VecSize) {
3572 Hi = CurDAG->getTargetExtractSubreg(
3573 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3574 MVT::getIntegerVT(VecSize), Hi);
3575 }
3576
3577 assert(Lo.getValueSizeInBits() <= VecSize &&
3578 Hi.getValueSizeInBits() <= VecSize);
3579
3580 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3581 // Really a scalar input. Just select from the low half of the register to
3582 // avoid packing.
3583
3584 if (VecSize == Lo.getValueSizeInBits()) {
3585 Src = Lo;
3586 } else if (VecSize == 32) {
3587 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3588 } else {
3589 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3590
3591 SDLoc SL(In);
3592 SDValue Undef = SDValue(
3593 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3594 Lo.getValueType()), 0);
3595 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3596 : AMDGPU::SReg_64RegClassID;
3597 const SDValue Ops[] = {
3598 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3599 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3600 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3601
3602 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3603 Src.getValueType(), Ops), 0);
3604 }
3605 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3606 return true;
3607 }
3608
3609 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3610 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3611 .bitcastToAPInt().getZExtValue();
3612 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3613 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3614 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3615 return true;
3616 }
3617 }
3618
3619 Mods = VecMods;
3620 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3621 Src.getNumOperands() == 2) {
3622
3623 // TODO: We should repeat the build_vector source check above for the
3624 // vector_shuffle for negates and casts of individual elements.
3625
3626 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3627 ArrayRef<int> Mask = SVN->getMask();
3628
3629 if (Mask[0] < 2 && Mask[1] < 2) {
3630 // src1 should be undef.
3631 SDValue ShuffleSrc = SVN->getOperand(0);
3632
3633 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3634 ShuffleSrc = ShuffleSrc.getOperand(0);
3635 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3636 }
3637
3638 if (Mask[0] == 1)
3639 Mods |= SISrcMods::OP_SEL_0;
3640 if (Mask[1] == 1)
3641 Mods |= SISrcMods::OP_SEL_1;
3642
3643 Src = ShuffleSrc;
3644 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3645 return true;
3646 }
3647 }
3648
3649 // Packed instructions do not have abs modifiers.
3650 Mods |= SISrcMods::OP_SEL_1;
3651
3652 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3653 return true;
3654}
3655
3656bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3657 SDValue &SrcMods) const {
3658 return SelectVOP3PMods(In, Src, SrcMods, true);
3659}
3660
3661bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3662 SDValue &Src) const {
3663 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3664 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3665
3666 unsigned Mods = SISrcMods::OP_SEL_1;
3667 unsigned SrcVal = C->getZExtValue();
3668 if (SrcVal == 1)
3669 Mods |= SISrcMods::OP_SEL_0;
3670
3671 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3672 return true;
3673}
3674
3675 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3676 llvm::SelectionDAG *CurDAG,
3677 const SDLoc &DL) {
3678 unsigned DstRegClass;
3679 EVT DstTy;
3680 switch (Elts.size()) {
3681 case 8:
3682 DstRegClass = AMDGPU::VReg_256RegClassID;
3683 DstTy = MVT::v8i32;
3684 break;
3685 case 4:
3686 DstRegClass = AMDGPU::VReg_128RegClassID;
3687 DstTy = MVT::v4i32;
3688 break;
3689 case 2:
3690 DstRegClass = AMDGPU::VReg_64RegClassID;
3691 DstTy = MVT::v2i32;
3692 break;
3693 default:
3694 llvm_unreachable("unhandled Reg sequence size");
3695 }
3696
3697 SmallVector<SDValue, 8 + 8 + 1> Ops;
3698 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3699 for (unsigned i = 0; i < Elts.size(); ++i) {
3700 Ops.push_back(Elts[i]);
3701 Ops.push_back(CurDAG->getTargetConstant(
3702 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3703 }
3704 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3705}
3706
3707 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3708 llvm::SelectionDAG *CurDAG,
3709 const SDLoc &DL) {
3710 SmallVector<SDValue, 8> PackedElts;
3711 assert("unhandled Reg sequence size" &&
3712 (Elts.size() == 8 || Elts.size() == 16));
3713
3714 // Pack 16-bit elements in pairs into a 32-bit register. If both elements were
3715 // unpacked from the same 32-bit source, reuse it; otherwise pack them using v_perm.
3716 for (unsigned i = 0; i < Elts.size(); i += 2) {
3717 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3718 SDValue HiSrc;
3719 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3720 PackedElts.push_back(HiSrc);
3721 } else {
3722 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3723 MachineSDNode *Packed =
3724 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3725 {Elts[i + 1], Elts[i], PackLoLo});
3726 PackedElts.push_back(SDValue(Packed, 0));
3727 }
3728 }
3729
3730 return buildRegSequence32(PackedElts, CurDAG, DL);
3731}
3732
3733 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3734 llvm::SelectionDAG *CurDAG,
3735 const SDLoc &DL, unsigned ElementSize) {
3736 if (ElementSize == 16)
3737 return buildRegSequence16(Elts, CurDAG, DL);
3738 if (ElementSize == 32)
3739 return buildRegSequence32(Elts, CurDAG, DL);
3740 llvm_unreachable("Unhandled element size");
3741}
3742
3743static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3744 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3745 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3746 unsigned ElementSize) {
3747 if (ModOpcode == ISD::FNEG) {
3748 Mods |= SISrcMods::NEG;
3749 // Check if all elements also have abs modifier
3750 SmallVector<SDValue, 8> NegAbsElts;
3751 for (auto El : Elts) {
3752 if (El.getOpcode() != ISD::FABS)
3753 break;
3754 NegAbsElts.push_back(El->getOperand(0));
3755 }
3756 if (Elts.size() != NegAbsElts.size()) {
3757 // Neg
3758 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3759 } else {
3760 // Neg and Abs
3761 Mods |= SISrcMods::NEG_HI;
3762 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3763 }
3764 } else {
3765 assert(ModOpcode == ISD::FABS);
3766 // Abs
3767 Mods |= SISrcMods::NEG_HI;
3768 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3769 }
3770}
3771
3772 // Check all f16 elements for modifiers while looking through b32 and v2b16
3773 // build vectors; stop if an element does not satisfy ModifierCheck.
3774static void
3775 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3776 std::function<bool(SDValue)> ModifierCheck) {
3777 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3778 if (auto *F16Pair =
3779 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3780 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3781 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3782 if (!ModifierCheck(ElF16))
3783 break;
3784 }
3785 }
3786 }
3787}
3788
3789bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3790 SDValue &SrcMods) const {
3791 Src = In;
3792 unsigned Mods = SISrcMods::OP_SEL_1;
3793
3794 // mods are on f16 elements
3795 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3796 SmallVector<SDValue, 8> EltsF16;
3797
3798 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3799 if (Element.getOpcode() != ISD::FNEG)
3800 return false;
3801 EltsF16.push_back(Element.getOperand(0));
3802 return true;
3803 });
3804
3805 // All elements have neg modifier
3806 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3807 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3808 Mods |= SISrcMods::NEG;
3809 Mods |= SISrcMods::NEG_HI;
3810 }
3811 }
3812
3813 // mods are on v2f16 elements
3814 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3815 SmallVector<SDValue, 8> EltsV2F16;
3816 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3817 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3818 // Only the neg modifier is matched here; stop at the first element without it.
3819 if (ElV2f16.getOpcode() != ISD::FNEG)
3820 break;
3821 EltsV2F16.push_back(ElV2f16.getOperand(0));
3822 }
3823
3824 // All pairs of elements have neg modifier
3825 if (BV->getNumOperands() == EltsV2F16.size()) {
3826 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3827 Mods |= SISrcMods::NEG;
3828 Mods |= SISrcMods::NEG_HI;
3829 }
3830 }
3831
3832 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3833 return true;
3834}
3835
3836bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3837 SDValue &SrcMods) const {
3838 Src = In;
3839 unsigned Mods = SISrcMods::OP_SEL_1;
3840 unsigned ModOpcode;
3841
3842 // mods are on f16 elements
3843 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3844 SmallVector<SDValue, 8> EltsF16;
3845 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3846 // Based on first element decide which mod we match, neg or abs
3847 if (EltsF16.empty())
3848 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3849 if (ElF16.getOpcode() != ModOpcode)
3850 return false;
3851 EltsF16.push_back(ElF16.getOperand(0));
3852 return true;
3853 });
3854
3855 // All elements have ModOpcode modifier
3856 if (BV->getNumOperands() * 2 == EltsF16.size())
3857 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3858 16);
3859 }
3860
3861 // mods are on v2f16 elements
3862 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3863 SmallVector<SDValue, 8> EltsV2F16;
3864
3865 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3866 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3867 // Based on first element decide which mod we match, neg or abs
3868 if (EltsV2F16.empty())
3869 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3870 if (ElV2f16->getOpcode() != ModOpcode)
3871 break;
3872 EltsV2F16.push_back(ElV2f16->getOperand(0));
3873 }
3874
3875 // All elements have ModOpcode modifier
3876 if (BV->getNumOperands() == EltsV2F16.size())
3877 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3878 32);
3879 }
3880
3881 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3882 return true;
3883}
3884
3885bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3886 SDValue &SrcMods) const {
3887 Src = In;
3888 unsigned Mods = SISrcMods::OP_SEL_1;
3889 SmallVector<SDValue, 8> EltsF32;
3890
3891 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3892 assert(BV->getNumOperands() > 0);
3893 // Based on first element decide which mod we match, neg or abs
3894 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3895 unsigned ModOpcode =
3896 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3897 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3898 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3899 if (ElF32.getOpcode() != ModOpcode)
3900 break;
3901 EltsF32.push_back(ElF32.getOperand(0));
3902 }
3903
3904 // All elements had ModOpcode modifier
3905 if (BV->getNumOperands() == EltsF32.size())
3906 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3907 32);
3908 }
3909
3910 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3911 return true;
3912}
3913
3914bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3915 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3916 BitVector UndefElements;
3917 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3918 if (isInlineImmediate(Splat.getNode())) {
3919 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3920 unsigned Imm = C->getAPIntValue().getSExtValue();
3921 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3922 return true;
3923 }
3924 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3925 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3926 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3927 return true;
3928 }
3929 llvm_unreachable("unhandled Constant node");
3930 }
3931 }
3932
3933 // 16 bit splat
3934 SDValue SplatSrc32 = stripBitcast(In);
3935 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3936 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3937 SDValue SplatSrc16 = stripBitcast(Splat32);
3938 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3939 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3940 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3941 std::optional<APInt> RawValue;
3942 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3943 RawValue = C->getValueAPF().bitcastToAPInt();
3944 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3945 RawValue = C->getAPIntValue();
3946
3947 if (RawValue.has_value()) {
3948 EVT VT = In.getValueType().getScalarType();
3949 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3950 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3951 ? APFloat::IEEEhalf()
3952 : APFloat::BFloat(),
3953 RawValue.value());
3954 if (TII->isInlineConstant(FloatVal)) {
3955 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3956 MVT::i16);
3957 return true;
3958 }
3959 } else if (VT.getSimpleVT() == MVT::i16) {
3960 if (TII->isInlineConstant(RawValue.value())) {
3961 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3962 MVT::i16);
3963 return true;
3964 }
3965 } else
3966 llvm_unreachable("unknown 16-bit type");
3967 }
3968 }
3969 }
3970
3971 return false;
3972}
3973
3974bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3975 SDValue &IndexKey) const {
3976 unsigned Key = 0;
3977 Src = In;
3978
3979 if (In.getOpcode() == ISD::SRL) {
3980 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3981 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3982 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3983 ShiftAmt->getZExtValue() % 8 == 0) {
3984 Key = ShiftAmt->getZExtValue() / 8;
3985 Src = ShiftSrc;
3986 }
3987 }
3988
3989 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3990 return true;
3991}
3992
3993bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3994 SDValue &IndexKey) const {
3995 unsigned Key = 0;
3996 Src = In;
3997
3998 if (In.getOpcode() == ISD::SRL) {
3999 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4000 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4001 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4002 ShiftAmt->getZExtValue() == 16) {
4003 Key = 1;
4004 Src = ShiftSrc;
4005 }
4006 }
4007
4008 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4009 return true;
4010}
4011
4012bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4013 SDValue &IndexKey) const {
4014 unsigned Key = 0;
4015 Src = In;
4016
4017 SDValue InI32;
4018
4019 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4020 const SDValue &ExtendSrc = In.getOperand(0);
4021 if (ExtendSrc.getValueSizeInBits() == 32)
4022 InI32 = ExtendSrc;
4023 } else if (In->getOpcode() == ISD::BITCAST) {
4024 const SDValue &CastSrc = In.getOperand(0);
4025 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4026 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4027 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4028 if (Zero && Zero->getZExtValue() == 0)
4029 InI32 = CastSrc.getOperand(0);
4030 }
4031 }
4032
4033 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4034 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4035 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4036 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4037 EltIdx->getZExtValue() == 1) {
4038 Key = 1;
4039 Src = ExtractVecEltSrc;
4040 }
4041 }
4042
4043 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4044 return true;
4045}
4046
4047bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4048 SDValue &SrcMods) const {
4049 Src = In;
4050 // FIXME: Handle op_sel
4051 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4052 return true;
4053}
4054
4055bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4056 SDValue &SrcMods) const {
4057 // FIXME: Handle op_sel
4058 return SelectVOP3Mods(In, Src, SrcMods);
4059}
4060
4061 // Match lowered fpext from bf16 to f32. This is a bit operation extending
4062 // a 16-bit value with 16 bits of zeroes at the LSB:
4063 //
4064 // 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4065 // 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4066 // 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
4067static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4068 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4069 return SDValue();
4070 Op = Op.getOperand(0);
4071
4072 IsExtractHigh = false;
4073 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4074 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4075 if (!Low16 || !Low16->isZero())
4076 return SDValue();
4077 Op = stripBitcast(Op.getOperand(1));
4078 if (Op.getValueType() != MVT::bf16)
4079 return SDValue();
4080 return Op;
4081 }
4082
4083 if (Op.getValueType() != MVT::i32)
4084 return SDValue();
4085
4086 if (Op.getOpcode() == ISD::AND) {
4087 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4088 if (Mask->getZExtValue() == 0xffff0000) {
4089 IsExtractHigh = true;
4090 return Op.getOperand(0);
4091 }
4092 }
4093 return SDValue();
4094 }
4095
4096 if (Op.getOpcode() == ISD::SHL) {
4097 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4098 if (Amt->getZExtValue() == 16)
4099 return Op.getOperand(0);
4100 }
4101 }
4102
4103 return SDValue();
4104}
4105
4106 // The return value is not whether the match is possible (which it always is),
4107 // but whether or not a conversion is really used.
4108bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4109 unsigned &Mods,
4110 MVT VT) const {
4111 Mods = 0;
4112 SelectVOP3ModsImpl(In, Src, Mods);
4113
4114 bool IsExtractHigh = false;
4115 if (Src.getOpcode() == ISD::FP_EXTEND) {
4116 Src = Src.getOperand(0);
4117 } else if (VT == MVT::bf16) {
4118 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4119 if (!B16)
4120 return false;
4121 Src = B16;
4122 } else
4123 return false;
4124
4125 if (Src.getValueType() != VT &&
4126 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4127 return false;
4128
4129 Src = stripBitcast(Src);
4130
4131 // Be careful about folding modifiers if we already have an abs. fneg is
4132 // applied last, so we don't want to apply an earlier fneg.
4133 if ((Mods & SISrcMods::ABS) == 0) {
4134 unsigned ModsTmp;
4135 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4136
4137 if ((ModsTmp & SISrcMods::NEG) != 0)
4138 Mods ^= SISrcMods::NEG;
4139
4140 if ((ModsTmp & SISrcMods::ABS) != 0)
4141 Mods |= SISrcMods::ABS;
4142 }
4143
4144 // op_sel/op_sel_hi decide the source type and source.
4145 // If the source's op_sel_hi is set, it indicates to do a conversion from
4146 // fp16. If the source's op_sel is set, it picks the high half of the source
4147 // register.
4148
4149 Mods |= SISrcMods::OP_SEL_1;
4150 if (Src.getValueSizeInBits() == 16) {
4151 if (isExtractHiElt(Src, Src)) {
4152 Mods |= SISrcMods::OP_SEL_0;
4153
4154 // TODO: Should we try to look for neg/abs here?
4155 return true;
4156 }
4157
4158 if (Src.getOpcode() == ISD::TRUNCATE &&
4159 Src.getOperand(0).getValueType() == MVT::i32) {
4160 Src = Src.getOperand(0);
4161 return true;
4162 }
4163
4164 if (Subtarget->useRealTrue16Insts())
4165 // In true16 mode, pack src into a 32-bit register.
4166 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4167 } else if (IsExtractHigh)
4168 Mods |= SISrcMods::OP_SEL_0;
4169
4170 return true;
4171}
4172
4173bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4174 SDValue &SrcMods) const {
4175 unsigned Mods = 0;
4176 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4177 return false;
4178 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4179 return true;
4180}
4181
4182bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4183 SDValue &SrcMods) const {
4184 unsigned Mods = 0;
4185 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4186 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4187 return true;
4188}
4189
4190bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4191 SDValue &SrcMods) const {
4192 unsigned Mods = 0;
4193 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4194 return false;
4195 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4196 return true;
4197}
4198
4199bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4200 SDValue &SrcMods) const {
4201 unsigned Mods = 0;
4202 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4203 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4204 return true;
4205}
4206
4207// Match BITOP3 operation and return a number of matched instructions plus
4208// truth table.
4209static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4210 SmallVectorImpl<SDValue> &Src) {
4211 unsigned NumOpcodes = 0;
4212 uint8_t LHSBits, RHSBits;
4213
4214 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4215 // Define truth table given Src0, Src1, Src2 bits permutations:
4216 // 0 0 0
4217 // 0 0 1
4218 // 0 1 0
4219 // 0 1 1
4220 // 1 0 0
4221 // 1 0 1
4222 // 1 1 0
4223 // 1 1 1
4224 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4225
4226 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4227 if (C->isAllOnes()) {
4228 Bits = 0xff;
4229 return true;
4230 }
4231 if (C->isZero()) {
4232 Bits = 0;
4233 return true;
4234 }
4235 }
4236
4237 for (unsigned I = 0; I < Src.size(); ++I) {
4238 // Try to find an existing, reused operand.
4239 if (Src[I] == Op) {
4240 Bits = SrcBits[I];
4241 return true;
4242 }
4243 // Try to replace the parent operator.
4244 if (Src[I] == In) {
4245 Bits = SrcBits[I];
4246 Src[I] = Op;
4247 return true;
4248 }
4249 }
4250
4251 if (Src.size() == 3) {
4252 // No room left for operands. Try one last time, there can be a 'not' of
4253 // one of our source operands. In this case we can compute the bits
4254 // without growing Src vector.
4255 if (Op.getOpcode() == ISD::XOR) {
4256 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4257 if (C->isAllOnes()) {
4258 SDValue LHS = Op.getOperand(0);
4259 for (unsigned I = 0; I < Src.size(); ++I) {
4260 if (Src[I] == LHS) {
4261 Bits = ~SrcBits[I];
4262 return true;
4263 }
4264 }
4265 }
4266 }
4267 }
4268
4269 return false;
4270 }
4271
4272 Bits = SrcBits[Src.size()];
4273 Src.push_back(Op);
4274 return true;
4275 };
4276
4277 switch (In.getOpcode()) {
4278 case ISD::AND:
4279 case ISD::OR:
4280 case ISD::XOR: {
4281 SDValue LHS = In.getOperand(0);
4282 SDValue RHS = In.getOperand(1);
4283
4284 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4285 if (!getOperandBits(LHS, LHSBits) ||
4286 !getOperandBits(RHS, RHSBits)) {
4287 Src = Backup;
4288 return std::make_pair(0, 0);
4289 }
4290
4291 // Recursion is naturally limited by the size of the operand vector.
4292 auto Op = BitOp3_Op(LHS, Src);
4293 if (Op.first) {
4294 NumOpcodes += Op.first;
4295 LHSBits = Op.second;
4296 }
4297
4298 Op = BitOp3_Op(RHS, Src);
4299 if (Op.first) {
4300 NumOpcodes += Op.first;
4301 RHSBits = Op.second;
4302 }
4303 break;
4304 }
4305 default:
4306 return std::make_pair(0, 0);
4307 }
4308
4309 uint8_t TTbl;
4310 switch (In.getOpcode()) {
4311 case ISD::AND:
4312 TTbl = LHSBits & RHSBits;
4313 break;
4314 case ISD::OR:
4315 TTbl = LHSBits | RHSBits;
4316 break;
4317 case ISD::XOR:
4318 TTbl = LHSBits ^ RHSBits;
4319 break;
4320 default:
4321 break;
4322 }
4323
4324 return std::make_pair(NumOpcodes + 1, TTbl);
4325}
4326
4327bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4328 SDValue &Src2, SDValue &Tbl) const {
4329 SmallVector<SDValue, 3> Src;
4330 uint8_t TTbl;
4331 unsigned NumOpcodes;
4332
4333 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4334
4335 // The Src.empty() case can happen if all operands are all zero or all ones.
4336 // Normally it should have been optimized out before reaching this point.
4337 if (NumOpcodes < 2 || Src.empty())
4338 return false;
4339
4340 // For a uniform case the threshold should be higher to account for moves between
4341 // VGPRs and SGPRs. It needs one operand in a VGPR; the other two can be in SGPRs,
4342 // with a readfirstlane after.
4343 if (NumOpcodes < 4 && !In->isDivergent())
4344 return false;
4345
4346 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4347 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4348 // asm more readable. This cannot be modeled with AddedComplexity because
4349 // the selector does not know how many operations we matched.
4350 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4351 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4352 In.getOperand(1).getOpcode() == In.getOpcode()))
4353 return false;
4354
4355 if (In.getOpcode() == ISD::OR &&
4356 (In.getOperand(0).getOpcode() == ISD::AND ||
4357 In.getOperand(1).getOpcode() == ISD::AND))
4358 return false;
4359 }
4360
4361 // The last operand can be ignored, turning a ternary operation into a binary
4362 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4363 // 'c' with 'a' here without changing the answer. In some pathological
4364 // cases it should even be possible to get an operation with a single operand
4365 // if the optimizer does not catch it.
4366 while (Src.size() < 3)
4367 Src.push_back(Src[0]);
4368
4369 Src0 = Src[0];
4370 Src1 = Src[1];
4371 Src2 = Src[2];
4372
4373 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4374 return true;
4375}
4376
4377SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4378 if (In.isUndef())
4379 return CurDAG->getUNDEF(MVT::i32);
4380
4381 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4382 SDLoc SL(In);
4383 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4384 }
4385
4386 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4387 SDLoc SL(In);
4388 return CurDAG->getConstant(
4389 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4390 }
4391
4392 SDValue Src;
4393 if (isExtractHiElt(In, Src))
4394 return Src;
4395
4396 return SDValue();
4397}
4398
4399bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4400 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4401
4402 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4403 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4404
4405 unsigned Limit = 0;
4406 bool AllUsesAcceptSReg = true;
4407 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4408 Limit < 10 && U != E; ++U, ++Limit) {
4409 const TargetRegisterClass *RC =
4410 getOperandRegClass(U->getUser(), U->getOperandNo());
4411
4412 // If the register class is unknown, it could be an unknown
4413 // register class that needs to be an SGPR, e.g. an inline asm
4414 // constraint
4415 if (!RC || SIRI->isSGPRClass(RC))
4416 return false;
4417
4418 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4419 RC != &AMDGPU::VS_64_Align2RegClass) {
4420 AllUsesAcceptSReg = false;
4421 SDNode *User = U->getUser();
4422 if (User->isMachineOpcode()) {
4423 unsigned Opc = User->getMachineOpcode();
4424 const MCInstrDesc &Desc = SII->get(Opc);
4425 if (Desc.isCommutable()) {
4426 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4427 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4428 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4429 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4430 const TargetRegisterClass *CommutedRC =
4431 getOperandRegClass(U->getUser(), CommutedOpNo);
4432 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4433 CommutedRC == &AMDGPU::VS_64RegClass ||
4434 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4435 AllUsesAcceptSReg = true;
4436 }
4437 }
4438 }
4439 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4440 // commuting current user. This means have at least one use
4441 // that strictly require VGPR. Thus, we will not attempt to commute
4442 // other user instructions.
4443 if (!AllUsesAcceptSReg)
4444 break;
4445 }
4446 }
4447 return !AllUsesAcceptSReg && (Limit < 10);
4448}
4449
4450bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4451 const auto *Ld = cast<LoadSDNode>(N);
4452 const MachineMemOperand *MMO = Ld->getMemOperand();
4453
4454 if (Ld->isDivergent()) {
4455 // FIXME: We ought to be able to take the direct isDivergent result. We
4456 // cannot rely on the MMO for a uniformity check, and should stop using
4457 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4458 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4459 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4460 // version, and then this can be dropped.
4461 if (!MMO->getValue() || !AMDGPU::isUniformMMO(MMO))
4462 return false;
4463 }
4464
4465 return MMO->getSize().hasValue() &&
4466 Ld->getAlign() >=
4467 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4468 uint64_t(4))) &&
4469 (MMO->isInvariant() ||
4470 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4471 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4472 (Subtarget->getScalarizeGlobalBehavior() &&
4473 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4474 Ld->isSimple() &&
4475 static_cast<const SITargetLowering *>(getTargetLowering())
4476 ->isMemOpHasNoClobberedMemOperand(N)));
4477}
4478
4479 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
4480 const AMDGPUTargetLowering &Lowering =
4481 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4482 bool IsModified = false;
4483 do {
4484 IsModified = false;
4485
4486 // Go over all selected nodes and try to fold them a bit more
4487 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4488 while (Position != CurDAG->allnodes_end()) {
4489 SDNode *Node = &*Position++;
4490 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
4491 if (!MachineNode)
4492 continue;
4493
4494 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4495 if (ResNode != Node) {
4496 if (ResNode)
4497 ReplaceUses(Node, ResNode);
4498 IsModified = true;
4499 }
4500 }
4501 CurDAG->RemoveDeadNodes();
4502 } while (IsModified);
4503}
4504