LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into an AMDGPU-specific
164/// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
379 return TII->isInlineConstant(C->getAPIntValue());
380
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \OpNo or NULL if the register class cannot be
390/// determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
438 Ops.push_back(NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(N->getOperand(i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
572 DL, EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
585}
586
588 EVT VT = N->getValueType(0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(N);
599
600 SDValue Src0 = SVN->getOperand(0);
601 SDValue Src1 = SVN->getOperand(1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
653
654 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
655 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
673 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 case ISD::UADDO_CARRY:
695 case ISD::USUBO_CARRY:
696 if (N->getValueType(0) == MVT::i64) {
697 SelectAddcSubbI64(N);
698 return;
699 }
700
701 if (N->getValueType(0) != MVT::i32)
702 break;
703
704 SelectAddcSubb(N);
705 return;
706 case ISD::UADDO:
707 case ISD::USUBO: {
708 if (N->getValueType(0) == MVT::i64) {
709 SelectAddcSubbI64(N);
710 return;
711 }
712
713 SelectUADDO_USUBO(N);
714 return;
715 }
716 case AMDGPUISD::FMUL_W_CHAIN: {
717 SelectFMUL_W_CHAIN(N);
718 return;
719 }
720 case AMDGPUISD::FMA_W_CHAIN: {
721 SelectFMA_W_CHAIN(N);
722 return;
723 }
724
726 case ISD::BUILD_VECTOR: {
727 EVT VT = N->getValueType(0);
728 unsigned NumVectorElts = VT.getVectorNumElements();
729 if (VT.getScalarSizeInBits() == 16) {
730 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
731 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
732 ReplaceNode(N, Packed);
733 return;
734 }
735 }
736
737 break;
738 }
739
740 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
741 assert(VT.getVectorElementType().bitsEq(MVT::i32));
742 const TargetRegisterClass *RegClass =
743 N->isDivergent()
744 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
745 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
746
747 SelectBuildVector(N, RegClass->getID());
748 return;
749 }
752 return;
753 case ISD::BUILD_PAIR: {
754 SDValue RC, SubReg0, SubReg1;
755 SDLoc DL(N);
756 if (N->getValueType(0) == MVT::i128) {
757 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
758 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
759 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
760 } else if (N->getValueType(0) == MVT::i64) {
761 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
762 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
763 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
764 } else {
765 llvm_unreachable("Unhandled value type for BUILD_PAIR");
766 }
767 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
768 N->getOperand(1), SubReg1 };
769 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
770 N->getValueType(0), Ops));
771 return;
772 }
773
774 case ISD::Constant:
775 case ISD::ConstantFP: {
776 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
777 Subtarget->has64BitLiterals())
778 break;
779
780 uint64_t Imm;
782 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
783 if (AMDGPU::isValid32BitLiteral(Imm, true))
784 break;
785 } else {
787 Imm = C->getZExtValue();
788 if (AMDGPU::isValid32BitLiteral(Imm, false))
789 break;
790 }
791
792 SDLoc DL(N);
793 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
794 return;
795 }
796 case AMDGPUISD::BFE_I32:
797 case AMDGPUISD::BFE_U32: {
798 // There is a scalar version available, but unlike the vector version which
799 // has a separate operand for the offset and width, the scalar version packs
800 // the width and offset into a single operand. Try to move to the scalar
801 // version if the offsets are constant, so that we can try to keep extended
802 // loads of kernel arguments in SGPRs.
803
804 // TODO: Technically we could try to pattern match scalar bitshifts of
805 // dynamic values, but it's probably not useful.
807 if (!Offset)
808 break;
809
810 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
811 if (!Width)
812 break;
813
814 bool Signed = Opc == AMDGPUISD::BFE_I32;
815
816 uint32_t OffsetVal = Offset->getZExtValue();
817 uint32_t WidthVal = Width->getZExtValue();
818
819 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
820 WidthVal));
821 return;
822 }
823 case AMDGPUISD::DIV_SCALE: {
824 SelectDIV_SCALE(N);
825 return;
826 }
829 SelectMAD_64_32(N);
830 return;
831 }
832 case ISD::SMUL_LOHI:
833 case ISD::UMUL_LOHI:
834 return SelectMUL_LOHI(N);
835 case ISD::CopyToReg: {
837 *static_cast<const SITargetLowering*>(getTargetLowering());
838 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
839 break;
840 }
841 case ISD::AND:
842 case ISD::SRL:
843 case ISD::SRA:
845 if (N->getValueType(0) != MVT::i32)
846 break;
847
848 SelectS_BFE(N);
849 return;
850 case ISD::BRCOND:
851 SelectBRCOND(N);
852 return;
853 case ISD::FP_EXTEND:
854 SelectFP_EXTEND(N);
855 return;
856 case AMDGPUISD::CVT_PKRTZ_F16_F32:
857 case AMDGPUISD::CVT_PKNORM_I16_F32:
858 case AMDGPUISD::CVT_PKNORM_U16_F32:
859 case AMDGPUISD::CVT_PK_U16_U32:
860 case AMDGPUISD::CVT_PK_I16_I32: {
861 // Hack around using a legal type if f16 is illegal.
862 if (N->getValueType(0) == MVT::i32) {
863 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
864 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
865 { N->getOperand(0), N->getOperand(1) });
866 SelectCode(N);
867 return;
868 }
869
870 break;
871 }
873 SelectINTRINSIC_W_CHAIN(N);
874 return;
875 }
877 SelectINTRINSIC_WO_CHAIN(N);
878 return;
879 }
880 case ISD::INTRINSIC_VOID: {
881 SelectINTRINSIC_VOID(N);
882 return;
883 }
885 SelectWAVE_ADDRESS(N);
886 return;
887 }
888 case ISD::STACKRESTORE: {
889 SelectSTACKRESTORE(N);
890 return;
891 }
892 }
893
894 SelectCode(N);
895}
896
898 if (!Subtarget->hasSDWA())
899 return false;
900
901 if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
902 EVT VT = cast<VTSDNode>(N->getOperand(1))->getVT();
903 return VT.getScalarSizeInBits() == 8 || VT.getScalarSizeInBits() == 16;
904 }
905
906 if (N->getOpcode() == ISD::AND)
907 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
908 return RHS->getZExtValue() == 0xFF || RHS->getZExtValue() == 0xFFFF;
909
910 if (N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL)
911 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
912 return (RHS->getZExtValue() % 8) == 0;
913
914 return false;
915}
916
917bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
918 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
919 const Instruction *Term = BB->getTerminator();
920 return Term->getMetadata("amdgpu.uniform") ||
921 Term->getMetadata("structurizecfg.uniform");
922}
923
924bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
925 unsigned ShAmtBits) const {
926 assert(N->getOpcode() == ISD::AND);
927
928 const APInt &RHS = N->getConstantOperandAPInt(1);
929 if (RHS.countr_one() >= ShAmtBits)
930 return true;
931
932 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
933 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
934}
935
937 SDValue &N0, SDValue &N1) {
938 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
940 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
941 // (i64 (bitcast (v2i32 (build_vector
942 // (or (extract_vector_elt V, 0), OFFSET),
943 // (extract_vector_elt V, 1)))))
944 SDValue Lo = Addr.getOperand(0).getOperand(0);
945 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
946 SDValue BaseLo = Lo.getOperand(0);
947 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
948 // Check that split base (Lo and Hi) are extracted from the same one.
949 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
951 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
952 // Lo is statically extracted from index 0.
953 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
954 BaseLo.getConstantOperandVal(1) == 0 &&
955 // Hi is statically extracted from index 1.
956 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
957 BaseHi.getConstantOperandVal(1) == 1) {
958 N0 = BaseLo.getOperand(0).getOperand(0);
959 N1 = Lo.getOperand(1);
960 return true;
961 }
962 }
963 }
964 return false;
965}
966
967bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
968 SDValue &RHS) const {
969 if (CurDAG->isBaseWithConstantOffset(Addr)) {
970 LHS = Addr.getOperand(0);
971 RHS = Addr.getOperand(1);
972 return true;
973 }
974
977 return true;
978 }
979
980 return false;
981}
982
984 return "AMDGPU DAG->DAG Pattern Instruction Selection";
985}
986
990
994#ifdef EXPENSIVE_CHECKS
996 .getManager();
997 auto &F = MF.getFunction();
998 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
999 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
1000 for (auto &L : LI.getLoopsInPreorder())
1001 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
1002#endif
1003 return SelectionDAGISelPass::run(MF, MFAM);
1004}
1005
1006//===----------------------------------------------------------------------===//
1007// Complex Patterns
1008//===----------------------------------------------------------------------===//
1009
1010bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
1011 SDValue &Offset) {
1012 return false;
1013}
1014
1015bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
1016 SDValue &Offset) {
1018 SDLoc DL(Addr);
1019
1020 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1021 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1022 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1023 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1024 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1025 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1026 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1027 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1028 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1029 Base = Addr.getOperand(0);
1030 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1031 } else {
1032 Base = Addr;
1033 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1034 }
1035
1036 return true;
1037}
1038
1039SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1040 const SDLoc &DL) const {
1041 SDNode *Mov = CurDAG->getMachineNode(
1042 AMDGPU::S_MOV_B32, DL, MVT::i32,
1043 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1044 return SDValue(Mov, 0);
1045}
1046
1047void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1048 SDValue LHS = N->getOperand(0);
1049 SDValue RHS = N->getOperand(1);
1050 SDValue CI = N->getOperand(2);
1051
1052 if (N->isDivergent()) {
1053 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1054 : AMDGPU::V_SUBB_U32_e64;
1055 CurDAG->SelectNodeTo(
1056 N, Opc, N->getVTList(),
1057 {LHS, RHS, CI,
1058 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1059 } else {
1060 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1061 : AMDGPU::S_SUB_CO_PSEUDO;
1062 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1063 }
1064}
1065
1066void AMDGPUDAGToDAGISel::SelectAddcSubbI64(SDNode *N) {
1067 SDLoc DL(N);
1068 SDValue LHS = N->getOperand(0);
1069 SDValue RHS = N->getOperand(1);
1070
1071 unsigned Opcode = N->getOpcode();
1072 bool ConsumeCarry = Opcode == ISD::UADDO_CARRY || Opcode == ISD::USUBO_CARRY;
1073 bool IsAdd = Opcode == ISD::UADDO || Opcode == ISD::UADDO_CARRY;
1074
1075 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1076 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1077
1078 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1079 MVT::i32, LHS, Sub0);
1080 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1081 MVT::i32, LHS, Sub1);
1082
1083 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1084 MVT::i32, RHS, Sub0);
1085 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1086 MVT::i32, RHS, Sub1);
1087
1088 SDVTList VTList = CurDAG->getVTList(MVT::i32, N->getValueType(1));
1089
1090 static const unsigned NoCarryOpcMap[2][2] = {
1091 {AMDGPU::S_USUBO_PSEUDO, AMDGPU::S_UADDO_PSEUDO},
1092 {AMDGPU::V_SUB_CO_U32_e64, AMDGPU::V_ADD_CO_U32_e64}};
1093 static const unsigned CarryOpcMap[2][2] = {
1094 {AMDGPU::S_SUB_CO_PSEUDO, AMDGPU::S_ADD_CO_PSEUDO},
1095 {AMDGPU::V_SUBB_U32_e64, AMDGPU::V_ADDC_U32_e64}};
1096
1097 bool IsVALU = N->isDivergent();
1098
1099 unsigned NoCarryOpc = NoCarryOpcMap[IsVALU][IsAdd];
1100 unsigned CarryOpc = CarryOpcMap[IsVALU][IsAdd];
1101 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1102
1103 SDNode *AddLo;
1104 if (!ConsumeCarry) {
1105 if (IsVALU) {
1106 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), Clamp};
1107 AddLo = CurDAG->getMachineNode(NoCarryOpc, DL, VTList, Args);
1108 } else {
1109 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0)};
1110 AddLo = CurDAG->getMachineNode(NoCarryOpc, DL, VTList, Args);
1111 }
1112 } else {
1113 if (IsVALU) {
1114 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2),
1115 Clamp};
1116 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1117 } else {
1118 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2)};
1119 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1120 }
1121 }
1122
1123 SDNode *AddHi;
1124 if (IsVALU) {
1125 SDValue Args[] = {SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1),
1126 Clamp};
1127 AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1128 } else {
1129 SDValue Args[] = {SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1)};
1130 AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1131 }
1132
1133 unsigned RC = IsVALU ? AMDGPU::VReg_64RegClassID : AMDGPU::SReg_64RegClassID;
1134 SDValue RegSequenceArgs[] = {CurDAG->getTargetConstant(RC, DL, MVT::i32),
1135 SDValue(AddLo, 0), Sub0, SDValue(AddHi, 0),
1136 Sub1};
1137 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1138 MVT::i64, RegSequenceArgs);
1139
1140 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1141 ReplaceNode(N, RegSequence);
1142}
1143
1144void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1145 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1146 // carry out despite the _i32 name. These were renamed in VI to _U32.
1147 // FIXME: We should probably rename the opcodes here.
1148 bool IsAdd = N->getOpcode() == ISD::UADDO;
1149 bool IsVALU = N->isDivergent();
1150
1151 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1152 ++UI)
1153 if (UI.getUse().getResNo() == 1) {
1154 if (UI->isMachineOpcode()) {
1155 if (UI->getMachineOpcode() !=
1156 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1157 IsVALU = true;
1158 break;
1159 }
1160 } else {
1161 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1162 IsVALU = true;
1163 break;
1164 }
1165 }
1166 }
1167
1168 if (IsVALU) {
1169 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1170
1171 CurDAG->SelectNodeTo(
1172 N, Opc, N->getVTList(),
1173 {N->getOperand(0), N->getOperand(1),
1174 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1175 } else {
1176 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1177
1178 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1179 {N->getOperand(0), N->getOperand(1)});
1180 }
1181}
1182
1183void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1184 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1185 SDValue Ops[10];
1186
1187 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1188 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1189 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1190 Ops[8] = N->getOperand(0);
1191 Ops[9] = N->getOperand(4);
1192
1193 // If there are no source modifiers, prefer fmac over fma because it can use
1194 // the smaller VOP2 encoding.
1195 bool UseFMAC = Subtarget->hasDLInsts() &&
1196 cast<ConstantSDNode>(Ops[0])->isZero() &&
1197 cast<ConstantSDNode>(Ops[2])->isZero() &&
1198 cast<ConstantSDNode>(Ops[4])->isZero();
1199 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1200 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1201}
1202
1203void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1204 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1205 SDValue Ops[8];
1206
1207 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1208 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1209 Ops[6] = N->getOperand(0);
1210 Ops[7] = N->getOperand(3);
1211
1212 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1213}
1214
1215// We need to handle this here because tablegen doesn't support matching
1216// instructions with multiple outputs.
1217void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1218 EVT VT = N->getValueType(0);
1219
1220 assert(VT == MVT::f32 || VT == MVT::f64);
1221
1222 unsigned Opc
1223 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1224
1225 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1226 // omod
1227 SDValue Ops[8];
1228 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1229 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1230 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1231 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1232}
1233
1234// We need to handle this here because tablegen doesn't support matching
1235// instructions with multiple outputs.
1236void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1237 SDLoc SL(N);
1238 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1239 unsigned Opc;
1240 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() && !N->hasAnyUseOfValue(1);
1241 if (Subtarget->hasMADIntraFwdBug())
1242 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1243 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1244 else if (UseNoCarry)
1245 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1246 else
1247 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1248
1249 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1250 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1251 Clamp };
1252
1253 if (UseNoCarry) {
1254 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1255 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1256 CurDAG->RemoveDeadNode(N);
1257 return;
1258 }
1259
1260 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1261}
1262
1263// We need to handle this here because tablegen doesn't support matching
1264// instructions with multiple outputs.
1265void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1266 SDLoc SL(N);
1267 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1268 SDVTList VTList;
1269 unsigned Opc;
1270 if (Subtarget->hasMadNC64_32Insts()) {
1271 VTList = CurDAG->getVTList(MVT::i64);
1272 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1273 } else {
1274 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1275 if (Subtarget->hasMADIntraFwdBug()) {
1276 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1277 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1278 } else {
1279 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1280 }
1281 }
1282
1283 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1284 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1285 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1286 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1287 if (!SDValue(N, 0).use_empty()) {
1288 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1289 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1290 MVT::i32, SDValue(Mad, 0), Sub0);
1291 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1292 }
1293 if (!SDValue(N, 1).use_empty()) {
1294 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1295 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1296 MVT::i32, SDValue(Mad, 0), Sub1);
1297 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1298 }
1299 CurDAG->RemoveDeadNode(N);
1300}
1301
1302bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1303 if (!isUInt<16>(Offset))
1304 return false;
1305
1306 if (!Base || Subtarget->hasUsableDSOffset() ||
1307 Subtarget->unsafeDSOffsetFoldingEnabled())
1308 return true;
1309
1310 // On Southern Islands instruction with a negative base value and an offset
1311 // don't seem to work.
1312 return CurDAG->SignBitIsZero(Base);
1313}
1314
1315bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1316 SDValue &Offset) const {
1317 SDLoc DL(Addr);
1318 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1319 SDValue N0 = Addr.getOperand(0);
1320 SDValue N1 = Addr.getOperand(1);
1321 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1322 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1323 // (add n0, c0)
1324 Base = N0;
1325 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1326 return true;
1327 }
1328 } else if (Addr.getOpcode() == ISD::SUB) {
1329 // sub C, x -> add (sub 0, x), C
1330 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1331 int64_t ByteOffset = C->getSExtValue();
1332 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1333 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1334
1335 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1336 // the known bits in isDSOffsetLegal. We need to emit the selected node
1337 // here, so this is thrown away.
1338 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1339 Zero, Addr.getOperand(1));
1340
1341 if (isDSOffsetLegal(Sub, ByteOffset)) {
1343 Opnds.push_back(Zero);
1344 Opnds.push_back(Addr.getOperand(1));
1345
1346 // FIXME: Select to VOP3 version for with-carry.
1347 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1348 if (Subtarget->hasAddNoCarryInsts()) {
1349 SubOp = AMDGPU::V_SUB_U32_e64;
1350 Opnds.push_back(
1351 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1352 }
1353
1354 MachineSDNode *MachineSub =
1355 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1356
1357 Base = SDValue(MachineSub, 0);
1358 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1359 return true;
1360 }
1361 }
1362 }
1363 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1364 // If we have a constant address, prefer to put the constant into the
1365 // offset. This can save moves to load the constant address since multiple
1366 // operations can share the zero base address register, and enables merging
1367 // into read2 / write2 instructions.
1368
1369 SDLoc DL(Addr);
1370
1371 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1372 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1373 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1374 DL, MVT::i32, Zero);
1375 Base = SDValue(MovZero, 0);
1376 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1377 return true;
1378 }
1379 }
1380
1381 // default case
1382 Base = Addr;
1383 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1384 return true;
1385}
1386
1387bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1388 unsigned Offset1,
1389 unsigned Size) const {
1390 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1391 return false;
1392 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1393 return false;
1394
1395 if (!Base || Subtarget->hasUsableDSOffset() ||
1396 Subtarget->unsafeDSOffsetFoldingEnabled())
1397 return true;
1398
1399 // On Southern Islands instruction with a negative base value and an offset
1400 // don't seem to work.
1401 return CurDAG->SignBitIsZero(Base);
1402}
1403
1404// Return whether the operation has NoUnsignedWrap property.
1405static bool isNoUnsignedWrap(SDValue Addr) {
1406 return (Addr.getOpcode() == ISD::ADD &&
1407 Addr->getFlags().hasNoUnsignedWrap()) ||
1408 Addr->getOpcode() == ISD::OR;
1409}
1410
1411// Check that the base address of flat scratch load/store in the form of `base +
1412// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1413// requirement). We always treat the first operand as the base address here.
1414bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1415 if (isNoUnsignedWrap(Addr))
1416 return true;
1417
1418 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1419 // values.
1420 if (Subtarget->hasSignedScratchOffsets())
1421 return true;
1422
1423 auto LHS = Addr.getOperand(0);
1424 auto RHS = Addr.getOperand(1);
1425
1426 // If the immediate offset is negative and within certain range, the base
1427 // address cannot also be negative. If the base is also negative, the sum
1428 // would be either negative or much larger than the valid range of scratch
1429 // memory a thread can access.
1430 ConstantSDNode *ImmOp = nullptr;
1431 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1432 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1433 return true;
1434 }
1435
1436 return CurDAG->SignBitIsZero(LHS);
1437}
1438
1439// Check address value in SGPR/VGPR are legal for flat scratch in the form
1440// of: SGPR + VGPR.
1441bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1442 if (isNoUnsignedWrap(Addr))
1443 return true;
1444
1445 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1446 // values.
1447 if (Subtarget->hasSignedScratchOffsets())
1448 return true;
1449
1450 auto LHS = Addr.getOperand(0);
1451 auto RHS = Addr.getOperand(1);
1452 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1453}
1454
1455// Check address value in SGPR/VGPR are legal for flat scratch in the form
1456// of: SGPR + VGPR + Imm.
1457bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1458 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1459 // values.
1460 if (AMDGPU::isGFX12Plus(*Subtarget))
1461 return true;
1462
1463 auto Base = Addr.getOperand(0);
1464 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1465 // If the immediate offset is negative and within certain range, the base
1466 // address cannot also be negative. If the base is also negative, the sum
1467 // would be either negative or much larger than the valid range of scratch
1468 // memory a thread can access.
1469 if (isNoUnsignedWrap(Base) &&
1470 (isNoUnsignedWrap(Addr) ||
1471 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1472 return true;
1473
1474 auto LHS = Base.getOperand(0);
1475 auto RHS = Base.getOperand(1);
1476 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1477}
1478
1479// TODO: If offset is too big, put low 16-bit into offset.
1480bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1481 SDValue &Offset0,
1482 SDValue &Offset1) const {
1483 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1484}
1485
1486bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1487 SDValue &Offset0,
1488 SDValue &Offset1) const {
1489 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1490}
1491
1492bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1493 SDValue &Offset0, SDValue &Offset1,
1494 unsigned Size) const {
1495 SDLoc DL(Addr);
1496
1497 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1498 SDValue N0 = Addr.getOperand(0);
1499 SDValue N1 = Addr.getOperand(1);
1500 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1501 unsigned OffsetValue0 = C1->getZExtValue();
1502 unsigned OffsetValue1 = OffsetValue0 + Size;
1503
1504 // (add n0, c0)
1505 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1506 Base = N0;
1507 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1508 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1509 return true;
1510 }
1511 } else if (Addr.getOpcode() == ISD::SUB) {
1512 // sub C, x -> add (sub 0, x), C
1513 if (const ConstantSDNode *C =
1515 unsigned OffsetValue0 = C->getZExtValue();
1516 unsigned OffsetValue1 = OffsetValue0 + Size;
1517
1518 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1519 SDLoc DL(Addr);
1520 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1521
1522 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1523 // the known bits in isDSOffsetLegal. We need to emit the selected node
1524 // here, so this is thrown away.
1525 SDValue Sub =
1526 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1527
1528 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1530 Opnds.push_back(Zero);
1531 Opnds.push_back(Addr.getOperand(1));
1532 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1533 if (Subtarget->hasAddNoCarryInsts()) {
1534 SubOp = AMDGPU::V_SUB_U32_e64;
1535 Opnds.push_back(
1536 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1537 }
1538
1539 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1540 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1541
1542 Base = SDValue(MachineSub, 0);
1543 Offset0 =
1544 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1545 Offset1 =
1546 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1547 return true;
1548 }
1549 }
1550 }
1551 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1552 unsigned OffsetValue0 = CAddr->getZExtValue();
1553 unsigned OffsetValue1 = OffsetValue0 + Size;
1554
1555 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1556 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1557 MachineSDNode *MovZero =
1558 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1559 Base = SDValue(MovZero, 0);
1560 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1561 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1562 return true;
1563 }
1564 }
1565
1566 // default case
1567
1568 Base = Addr;
1569 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1570 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1571 return true;
1572}
1573
1574bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1575 SDValue &SOffset, SDValue &Offset,
1576 SDValue &Offen, SDValue &Idxen,
1577 SDValue &Addr64) const {
1578 // Subtarget prefers to use flat instruction
1579 // FIXME: This should be a pattern predicate and not reach here
1580 if (Subtarget->useFlatForGlobal())
1581 return false;
1582
1583 SDLoc DL(Addr);
1584
1585 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1586 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1587 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1588 SOffset = Subtarget->hasRestrictedSOffset()
1589 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1590 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1591
1592 ConstantSDNode *C1 = nullptr;
1593 SDValue N0 = Addr;
1594 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1595 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1596 if (isUInt<32>(C1->getZExtValue()))
1597 N0 = Addr.getOperand(0);
1598 else
1599 C1 = nullptr;
1600 }
1601
1602 if (N0->isAnyAdd()) {
1603 // (add N2, N3) -> addr64, or
1604 // (add (add N2, N3), C1) -> addr64
1605 SDValue N2 = N0.getOperand(0);
1606 SDValue N3 = N0.getOperand(1);
1607 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1608
1609 if (N2->isDivergent()) {
1610 if (N3->isDivergent()) {
1611 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1612 // addr64, and construct the resource from a 0 address.
1613 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1614 VAddr = N0;
1615 } else {
1616 // N2 is divergent, N3 is not.
1617 Ptr = N3;
1618 VAddr = N2;
1619 }
1620 } else {
1621 // N2 is not divergent.
1622 Ptr = N2;
1623 VAddr = N3;
1624 }
1625 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1626 } else if (N0->isDivergent()) {
1627 // N0 is divergent. Use it as the addr64, and construct the resource from a
1628 // 0 address.
1629 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1630 VAddr = N0;
1631 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1632 } else {
1633 // N0 -> offset, or
1634 // (N0 + C1) -> offset
1635 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1636 Ptr = N0;
1637 }
1638
1639 if (!C1) {
1640 // No offset.
1641 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1642 return true;
1643 }
1644
1645 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1646 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1647 // Legal offset for instruction.
1648 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1649 return true;
1650 }
1651
1652 // Illegal offset, store it in soffset.
1653 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1654 SOffset =
1655 SDValue(CurDAG->getMachineNode(
1656 AMDGPU::S_MOV_B32, DL, MVT::i32,
1657 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1658 0);
1659 return true;
1660}
1661
1662bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1663 SDValue &VAddr, SDValue &SOffset,
1664 SDValue &Offset) const {
1665 SDValue Ptr, Offen, Idxen, Addr64;
1666
1667 // addr64 bit was removed for volcanic islands.
1668 // FIXME: This should be a pattern predicate and not reach here
1669 if (!Subtarget->hasAddr64())
1670 return false;
1671
1672 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1673 return false;
1674
1675 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1676 if (C->getSExtValue()) {
1677 SDLoc DL(Addr);
1678
1679 const SITargetLowering& Lowering =
1680 *static_cast<const SITargetLowering*>(getTargetLowering());
1681
1682 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1683 return true;
1684 }
1685
1686 return false;
1687}
1688
1689std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1690 SDLoc DL(N);
1691
1692 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1693 SDValue TFI =
1694 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1695
1696 // We rebase the base address into an absolute stack address and hence
1697 // use constant 0 for soffset. This value must be retained until
1698 // frame elimination and eliminateFrameIndex will choose the appropriate
1699 // frame register if need be.
1700 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1701}
1702
// Select a scratch address for a MUBUF access that takes a vector address.
//
// Rsrc is always the function's scratch resource register. Three shapes are
// handled, in order:
//  - a constant address other than the null pointer value: the bits above the
//    maximum MUBUF immediate are moved into VAddr with V_MOV_B32 and the
//    remaining low bits become ImmOffset;
//  - (base + constant) with a legal immediate, subject to the range-check
//    restriction described below: base goes through foldFrameIndex and the
//    constant becomes ImmOffset;
//  - anything else: Addr goes through foldFrameIndex with ImmOffset 0.
//
// Always returns true. Parent is not used in the visible body.
1703bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1704 SDValue Addr, SDValue &Rsrc,
1705 SDValue &VAddr, SDValue &SOffset,
1706 SDValue &ImmOffset) const {
1707
1708 SDLoc DL(Addr);
1709 MachineFunction &MF = CurDAG->getMachineFunction();
1710 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1711
1712 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1713
1714 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1715 int64_t Imm = CAddr->getSExtValue();
1716 const int64_t NullPtr =
// NOTE(review): the initializer of NullPtr (listing line 1717) is missing
// from this copy of the file; restore it from the upstream source.
1718 // Don't fold null pointer.
1719 if (Imm != NullPtr) {
1720 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
// Split the constant: bits above MaxOffset go to a VGPR, the rest to the
// immediate field.
1721 SDValue HighBits =
1722 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1723 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1724 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1725 VAddr = SDValue(MovHighBits, 0);
1726
1727 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1728 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1729 return true;
1730 }
1731 }
1732
1733 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1734 // (add n0, c1)
1735
1736 SDValue N0 = Addr.getOperand(0);
1737 uint64_t C1 = Addr.getConstantOperandVal(1);
1738
1739 // Offsets in vaddr must be positive if range checking is enabled.
1740 //
1741 // The total computation of vaddr + soffset + offset must not overflow. If
1742 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1743 // overflowing.
1744 //
1745 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1746 // always perform a range check. If a negative vaddr base index was used,
1747 // this would fail the range check. The overall address computation would
1748 // compute a valid address, but this doesn't happen due to the range
1749 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1750 //
1751 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1752 // MUBUF vaddr, but not on older subtargets which can only do this if the
1753 // sign bit is known 0.
1754 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1755 if (TII->isLegalMUBUFImmOffset(C1) &&
1756 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1757 CurDAG->SignBitIsZero(N0))) {
1758 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1759 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1760 return true;
1761 }
1762 }
1763
1764 // (node)
1765 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1766 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1767 return true;
1768}
1769
1770static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1771 if (Val.getOpcode() != ISD::CopyFromReg)
1772 return false;
1773 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1774 if (!Reg.isPhysical())
1775 return false;
1776 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1777 return RC && TRI.isSGPRClass(RC);
1778}
1779
1780bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1781 SDValue Addr,
1782 SDValue &SRsrc,
1783 SDValue &SOffset,
1784 SDValue &Offset) const {
1785 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1786 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1787 MachineFunction &MF = CurDAG->getMachineFunction();
1788 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1789 SDLoc DL(Addr);
1790
1791 // CopyFromReg <sgpr>
1792 if (IsCopyFromSGPR(*TRI, Addr)) {
1793 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1794 SOffset = Addr;
1795 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1796 return true;
1797 }
1798
1799 ConstantSDNode *CAddr;
1800 if (Addr.getOpcode() == ISD::ADD) {
1801 // Add (CopyFromReg <sgpr>) <constant>
1802 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1803 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1804 return false;
1805 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1806 return false;
1807
1808 SOffset = Addr.getOperand(0);
1809 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1810 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1811 // <constant>
1812 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1813 } else {
1814 return false;
1815 }
1816
1817 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1818
1819 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1820 return true;
1821}
1822
1823bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1824 SDValue &SOffset, SDValue &Offset
1825 ) const {
1826 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1827 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1828
1829 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1830 return false;
1831
1832 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1833 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1834 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1835 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1836 maskTrailingOnes<uint64_t>(32); // Size
1837 SDLoc DL(Addr);
1838
1839 const SITargetLowering& Lowering =
1840 *static_cast<const SITargetLowering*>(getTargetLowering());
1841
1842 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1843 return true;
1844 }
1845 return false;
1846}
1847
1848bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1849 SDValue &SOffset) const {
1850 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1851 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1852 return true;
1853 }
1854
1855 SOffset = ByteOffsetNode;
1856 return true;
1857}
1858
1859// Find a load or store from corresponding pattern root.
1860// Roots may be build_vector, bitconvert or their combinations.
// NOTE(review): this listing is incomplete. The function signature (listing
// lines 1861-1862), line 1865, and the initializer of the inner MN (line
// 1868) are missing from this copy; restore them from the upstream source.
// From the call site in this file the function is named findMemSDNode, takes
// the pattern root N, and its result is used as a MemSDNode pointer.
1863 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1864 return MN;
// Otherwise search the operands of N for a memory node.
1866 for (SDValue V : N->op_values())
1867 if (MemSDNode *MN =
1869 return MN;
1870 llvm_unreachable("cannot find MemSDNode in the pattern!");
1871}
// Split Addr into a base (VAddr) and a signed i32 immediate (Offset) for the
// given FLAT-family variant.
//
// A constant offset is only considered when the subtarget has FLAT instruction
// offsets and the flat-segment-offset bug cannot apply. If the constant is a
// legal FLAT offset it is folded whole; otherwise it is split with
// splitFlatOffset and the remainder is added to the base with explicit VALU
// adds (one 32-bit add, or an add/addc pair for 64-bit addresses).
//
// Always returns true; when nothing is folded, VAddr is Addr and Offset is 0.
1873bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(
1874 SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset,
1875 AMDGPU::FlatAddrSpace FlatVariant) const {
// NOTE(review): listing line 1876 is missing from this copy. The unqualified
// FlatAddrSpace names below need whatever it provided -- confirm upstream.
1877 int64_t OffsetVal = 0;
1878
1879 unsigned AS = findMemSDNode(N)->getAddressSpace();
1880
1881 bool CanHaveFlatSegmentOffsetBug =
1882 Subtarget->hasFlatSegmentOffsetBug() &&
1883 FlatVariant == FlatAddrSpace::FLAT &&
// NOTE(review): the final term of this condition (listing line 1884) is
// missing from this copy; restore it from the upstream source.
1885
1886 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1887 SDValue N0, N1;
1888 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1889 (FlatVariant != FlatAddrSpace::FlatScratch ||
1890 isFlatScratchBaseLegal(Addr))) {
1891 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1892
1893 // Adding the offset to the base address in a FLAT instruction must not
1894 // change the memory aperture in which the address falls. Therefore we can
1895 // only fold offsets from inbounds GEPs into FLAT instructions.
1896 bool IsInBounds =
1897 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1898 if (COffsetVal == 0 || FlatVariant != FlatAddrSpace::FLAT || IsInBounds) {
1899 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1900 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
// The whole constant fits in the offset field.
1901 Addr = N0;
1902 OffsetVal = COffsetVal;
1903 } else {
1904 // If the offset doesn't fit, put the low bits into the offset field
1905 // and add the rest.
1906 //
1907 // For a FLAT instruction the hardware decides whether to access
1908 // global/scratch/shared memory based on the high bits of vaddr,
1909 // ignoring the offset field, so we have to ensure that when we add
1910 // remainder to vaddr it still points into the same underlying object.
1911 // The easiest way to do that is to make sure that we split the offset
1912 // into two pieces that are both >= 0 or both <= 0.
1913
1914 SDLoc DL(N);
1915 uint64_t RemainderOffset;
1916
1917 std::tie(OffsetVal, RemainderOffset) =
1918 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1919
1920 SDValue AddOffsetLo =
1921 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1922 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1923
1924 if (Addr.getValueType().getSizeInBits() == 32) {
// 32-bit address: a single VALU add of the remainder.
// NOTE(review): the declaration of Opnds (listing line 1925) is missing from
// this copy; restore it from the upstream source.
1926 Opnds.push_back(N0);
1927 Opnds.push_back(AddOffsetLo);
1928 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1929 if (Subtarget->hasAddNoCarryInsts()) {
1930 AddOp = AMDGPU::V_ADD_U32_e64;
1931 Opnds.push_back(Clamp);
1932 }
1933 Addr =
1934 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1935 } else {
1936 // TODO: Should this try to use a scalar add pseudo if the base
1937 // address is uniform and saddr is usable?
1938 SDValue Sub0 =
1939 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1940 SDValue Sub1 =
1941 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1942
// Split the base into 32-bit halves for an add / add-with-carry pair.
1943 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1944 DL, MVT::i32, N0, Sub0);
1945 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1946 DL, MVT::i32, N0, Sub1);
1947
1948 SDValue AddOffsetHi =
1949 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1950
1951 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1952
1953 SDNode *Add =
1954 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1955 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1956
// The high half consumes the carry-out (result 1) of the low add.
1957 SDNode *Addc = CurDAG->getMachineNode(
1958 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1959 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1960
1961 SDValue RegSequenceArgs[] = {
1962 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1963 MVT::i32),
1964 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1965
1966 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1967 MVT::i64, RegSequenceArgs),
1968 0);
1969 }
1970 }
1971 }
1972 }
1973 }
1974
1975 VAddr = Addr;
1976 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1977 return true;
1978}
1979
1980bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1981 SDValue &VAddr,
1982 SDValue &Offset) const {
1983 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1985}
1986
1987bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1988 SDValue &VAddr,
1989 SDValue &Offset) const {
1990 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1992}
1993
1994bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1995 SDValue &VAddr,
1996 SDValue &Offset) const {
1997 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1999}
2000
2001// If this matches *_extend i32:x, return x
2002// Otherwise if the value is I32 returns x.
2004 const SelectionDAG *DAG) {
2005 if (Op.getValueType() == MVT::i32)
2006 return Op;
2007
2008 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
2009 Op.getOpcode() != ISD::ANY_EXTEND &&
2010 !(DAG->SignBitIsZero(Op) &&
2011 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
2012 return SDValue();
2013
2014 SDValue ExtSrc = Op.getOperand(0);
2015 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
2016}
2017
2018// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
2019// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
2020bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2021 SDValue &SAddr, SDValue &VOffset,
2022 SDValue &Offset, bool &ScaleOffset,
2023 bool NeedIOffset) const {
2025 int64_t ImmOffset = 0;
2026 ScaleOffset = false;
2027
2028 // Match the immediate offset first, which canonically is moved as low as
2029 // possible.
2030
2031 SDValue LHS, RHS;
2032 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2033 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2034 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2035
2036 if (NeedIOffset &&
2037 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
2038 FlatAddrSpace::FlatGlobal)) {
2039 Addr = LHS;
2040 ImmOffset = COffsetVal;
2041 } else if (!LHS->isDivergent()) {
2042 if (COffsetVal > 0) {
2043 SDLoc SL(N);
2044 // saddr + large_offset -> saddr +
2045 // (voffset = large_offset & ~MaxOffset) +
2046 // (large_offset & MaxOffset);
2047 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2048 if (NeedIOffset) {
2049 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2050 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, FlatAddrSpace::FlatGlobal);
2051 }
2052
2053 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2054 : isUInt<32>(RemainderOffset)) {
2055 SDNode *VMov = CurDAG->getMachineNode(
2056 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2057 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2058 VOffset = SDValue(VMov, 0);
2059 SAddr = LHS;
2060 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2061 return true;
2062 }
2063 }
2064
2065 // We are adding a 64 bit SGPR and a constant. If constant bus limit
2066 // is 1 we would need to perform 1 or 2 extra moves for each half of
2067 // the constant and it is better to do a scalar add and then issue a
2068 // single VALU instruction to materialize zero. Otherwise it is less
2069 // instructions to perform VALU adds with immediates or inline literals.
2070 unsigned NumLiterals =
2071 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2072 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2073 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2074 return false;
2075 }
2076 }
2077
2078 // Match the variable offset.
2079 if (Addr->isAnyAdd()) {
2080 LHS = Addr.getOperand(0);
2081
2082 if (!LHS->isDivergent()) {
2083 // add (i64 sgpr), (*_extend (i32 vgpr))
2084 RHS = Addr.getOperand(1);
2085 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2086 if (SDValue ExtRHS = matchExtFromI32orI32(
2087 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2088 SAddr = LHS;
2089 VOffset = ExtRHS;
2090 }
2091 }
2092
2093 RHS = Addr.getOperand(1);
2094 if (!SAddr && !RHS->isDivergent()) {
2095 // add (*_extend (i32 vgpr)), (i64 sgpr)
2096 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2097 if (SDValue ExtLHS = matchExtFromI32orI32(
2098 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2099 SAddr = RHS;
2100 VOffset = ExtLHS;
2101 }
2102 }
2103
2104 if (SAddr) {
2105 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2106 return true;
2107 }
2108 }
2109
2110 if (Subtarget->hasScaleOffset() &&
2111 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2114 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2115 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2116 Addr.getOperand(0)->isDivergent() &&
2118 !Addr.getOperand(2)->isDivergent()) {
2119 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2120 unsigned Size =
2121 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2122 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2123 if (ScaleOffset) {
2124 SAddr = Addr.getOperand(2);
2125 VOffset = Addr.getOperand(0);
2126 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2127 return true;
2128 }
2129 }
2130
2131 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2132 isa<ConstantSDNode>(Addr))
2133 return false;
2134
2135 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2136 // moves required to copy a 64-bit SGPR to VGPR.
2137 SAddr = Addr;
2138 SDNode *VMov =
2139 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2140 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2141 VOffset = SDValue(VMov, 0);
2142 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2143 return true;
2144}
2145
2146bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2147 SDValue &SAddr, SDValue &VOffset,
2148 SDValue &Offset,
2149 SDValue &CPol) const {
2150 bool ScaleOffset;
2151 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2152 return false;
2153
2154 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2155 SDLoc(), MVT::i32);
2156 return true;
2157}
2158
2159bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2160 SDValue &SAddr, SDValue &VOffset,
2161 SDValue &Offset,
2162 SDValue &CPol) const {
2163 bool ScaleOffset;
2164 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2165 return false;
2166
2167 // We are assuming CPol is always the last operand of the intrinsic.
2168 auto PassedCPol =
2169 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2170 CPol = CurDAG->getTargetConstant(
2171 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2172 return true;
2173}
2174
2175bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2176 SDValue &SAddr,
2177 SDValue &VOffset,
2178 SDValue &Offset,
2179 SDValue &CPol) const {
2180 bool ScaleOffset;
2181 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2182 return false;
2183
2184 // We are assuming CPol is second from last operand of the intrinsic.
2185 auto PassedCPol =
2186 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2187 CPol = CurDAG->getTargetConstant(
2188 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2189 return true;
2190}
2191
2192bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2193 SDValue &SAddr, SDValue &VOffset,
2194 SDValue &Offset,
2195 SDValue &CPol) const {
2196 bool ScaleOffset;
2197 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2198 return false;
2199
2200 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2201 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2202 return true;
2203}
2204
2205bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2206 SDValue &SAddr,
2207 SDValue &VOffset,
2208 SDValue &CPol) const {
2209 bool ScaleOffset;
2210 SDValue DummyOffset;
2211 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2212 false))
2213 return false;
2214
2215 // We are assuming CPol is always the last operand of the intrinsic.
2216 auto PassedCPol =
2217 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2218 CPol = CurDAG->getTargetConstant(
2219 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2220 return true;
2221}
2222
2223bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2224 SDValue &SAddr,
2225 SDValue &VOffset,
2226 SDValue &CPol) const {
2227 bool ScaleOffset;
2228 SDValue DummyOffset;
2229 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2230 false))
2231 return false;
2232
2233 // We are assuming CPol is second from last operand of the intrinsic.
2234 auto PassedCPol =
2235 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2236 CPol = CurDAG->getTargetConstant(
2237 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2238 return true;
2239}
2240
2242 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2243 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2244 } else if (SAddr.getOpcode() == ISD::ADD &&
2246 // Materialize this into a scalar move for scalar address to avoid
2247 // readfirstlane.
2248 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2249 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2250 FI->getValueType(0));
2251 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2252 MVT::i32, TFI, SAddr.getOperand(1)),
2253 0);
2254 }
2255
2256 return SAddr;
2257}
2258
2259// Match (32-bit SGPR base) + sext(imm offset)
2260bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2261 SDValue &SAddr,
2262 SDValue &Offset) const {
2264 if (Addr->isDivergent())
2265 return false;
2266
2267 SDLoc DL(Addr);
2268
2269 int64_t COffsetVal = 0;
2270
2271 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2272 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2273 SAddr = Addr.getOperand(0);
2274 } else {
2275 SAddr = Addr;
2276 }
2277
2278 SAddr = SelectSAddrFI(CurDAG, SAddr);
2279
2280 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2281
2282 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2283 FlatAddrSpace::FlatScratch)) {
2284 int64_t SplitImmOffset, RemainderOffset;
2285 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2286 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, FlatAddrSpace::FlatScratch);
2287
2288 COffsetVal = SplitImmOffset;
2289
2290 SDValue AddOffset =
2292 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2293 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2294 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2295 SAddr, AddOffset),
2296 0);
2297 }
2298
2299 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2300
2301 return true;
2302}
2303
2304// Check whether the flat scratch SVS swizzle bug affects this access.
2305bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2306 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2307 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2308 return false;
2309
2310 // The bug affects the swizzling of SVS accesses if there is any carry out
2311 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2312 // voffset to (soffset + inst_offset).
2313 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2314 KnownBits SKnown =
2315 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2316 KnownBits::makeConstant(APInt(32, ImmOffset,
2317 /*isSigned=*/true)));
2318 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2319 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2320 return (VMax & 3) + (SMax & 3) >= 4;
2321}
2322
2323bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2324 SDValue &VAddr, SDValue &SAddr,
2325 SDValue &Offset,
2326 SDValue &CPol) const {
2327 int64_t ImmOffset = 0;
2328
2329 SDValue LHS, RHS;
2330 SDValue OrigAddr = Addr;
2331 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2332 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2333 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2334
2335 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2337 Addr = LHS;
2338 ImmOffset = COffsetVal;
2339 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2340 SDLoc SL(N);
2341 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2342 // (large_offset & MaxOffset);
2343 int64_t SplitImmOffset, RemainderOffset;
2344 std::tie(SplitImmOffset, RemainderOffset) =
2345 TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2347
2348 if (isUInt<32>(RemainderOffset)) {
2349 SDNode *VMov = CurDAG->getMachineNode(
2350 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2351 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2352 VAddr = SDValue(VMov, 0);
2353 SAddr = LHS;
2354 if (!isFlatScratchBaseLegal(Addr))
2355 return false;
2356 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2357 return false;
2358 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2359 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2360 return true;
2361 }
2362 }
2363 }
2364
2365 if (Addr.getOpcode() != ISD::ADD)
2366 return false;
2367
2368 LHS = Addr.getOperand(0);
2369 RHS = Addr.getOperand(1);
2370
2371 if (!LHS->isDivergent() && RHS->isDivergent()) {
2372 SAddr = LHS;
2373 VAddr = RHS;
2374 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2375 SAddr = RHS;
2376 VAddr = LHS;
2377 } else {
2378 return false;
2379 }
2380
2381 if (OrigAddr != Addr) {
2382 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2383 return false;
2384 } else {
2385 if (!isFlatScratchBaseLegalSV(OrigAddr))
2386 return false;
2387 }
2388
2389 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2390 return false;
2391 SAddr = SelectSAddrFI(CurDAG, SAddr);
2392 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2393
2394 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2395 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2396 SDLoc(), MVT::i32);
2397 return true;
2398}
2399
2400// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2401// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2402// Handle the case where the Immediate Offset + SOffset is negative.
2403bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2404 bool Imm32Only,
2405 bool IsBuffer,
2406 int64_t ImmOffset) const {
2407 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2408 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2409 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2410 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2411 return false;
2412 }
2413
2414 return true;
2415}
2416
2417// Given \p Offset and load node \p N check if an \p Offset is a multiple of
2418// the load byte size. If it is update \p Offset to a pre-scaled value and
2419// return true.
2420bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2421 bool IsSigned) const {
2422 bool ScaleOffset = false;
2423 if (!Subtarget->hasScaleOffset() || !Offset)
2424 return false;
2425
2426 unsigned Size =
2427 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2428
2429 SDValue Off = Offset;
2430 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2431 Off = Ext;
2432
2433 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2434 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2435 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2436 } else if (Offset.getOpcode() == ISD::MUL ||
2437 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2438 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2439 (Offset.isMachineOpcode() &&
2440 Offset.getMachineOpcode() ==
2441 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2442 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2443 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2444 ScaleOffset = C->getZExtValue() == Size;
2445 }
2446
2447 if (ScaleOffset)
2448 Offset = Off.getOperand(0);
2449
2450 return ScaleOffset;
2451}
2452
2453// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2454// not null) offset. If Imm32Only is true, match only 32-bit immediate
2455// offsets available on CI.
2456bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2457 SDValue *SOffset, SDValue *Offset,
2458 bool Imm32Only, bool IsBuffer,
2459 bool HasSOffset, int64_t ImmOffset,
2460 bool *ScaleOffset) const {
2461 assert((!SOffset || !Offset) &&
2462 "Cannot match both soffset and offset at the same time!");
2463
2464 if (ScaleOffset) {
2465 assert(N && SOffset);
2466
2467 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2468 }
2469
2470 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2471 if (!C) {
2472 if (!SOffset)
2473 return false;
2474
2475 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2476 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2477 *SOffset = ByteOffsetNode;
2478 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2479 ImmOffset);
2480 }
2481 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2482 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2483 *SOffset = ByteOffsetNode.getOperand(0);
2484 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2485 ImmOffset);
2486 }
2487 }
2488 return false;
2489 }
2490
2491 SDLoc SL(ByteOffsetNode);
2492
2493 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2494 // offset for S_BUFFER instructions is unsigned.
2495 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2496 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2497 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2498 if (EncodedOffset && Offset && !Imm32Only) {
2499 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2500 return true;
2501 }
2502
2503 // SGPR and literal offsets are unsigned.
2504 if (ByteOffset < 0)
2505 return false;
2506
2507 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2508 if (EncodedOffset && Offset && Imm32Only) {
2509 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2510 return true;
2511 }
2512
2513 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2514 return false;
2515
2516 if (SOffset) {
2517 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2518 *SOffset = SDValue(
2519 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2520 return true;
2521 }
2522
2523 return false;
2524}
2525
2526SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2527 if (Addr.getValueType() != MVT::i32)
2528 return Addr;
2529
2530 // Zero-extend a 32-bit address.
2531 SDLoc SL(Addr);
2532
2533 const MachineFunction &MF = CurDAG->getMachineFunction();
2534 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2535 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2536 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2537
2538 const SDValue Ops[] = {
2539 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2540 Addr,
2541 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2542 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2543 0),
2544 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2545 };
2546
2547 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2548 Ops), 0);
2549}
2550
2551// Match a base and an immediate (if Offset is not null) or an SGPR (if
2552// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2553// true, match only 32-bit immediate offsets available on CI.
2554bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2555 SDValue &SBase, SDValue *SOffset,
2556 SDValue *Offset, bool Imm32Only,
2557 bool IsBuffer, bool HasSOffset,
2558 int64_t ImmOffset,
2559 bool *ScaleOffset) const {
2560 if (SOffset && Offset) {
2561 assert(!Imm32Only && !IsBuffer);
2562 SDValue B;
2563
2564 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2565 return false;
2566
2567 int64_t ImmOff = 0;
2568 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2569 ImmOff = C->getSExtValue();
2570
2571 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2572 true, ImmOff, ScaleOffset);
2573 }
2574
2575 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2576 // wraparound, because s_load instructions perform the addition in 64 bits.
2577 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2578 !Addr->getFlags().hasNoUnsignedWrap())
2579 return false;
2580
2581 SDValue N0, N1;
2582 // Extract the base and offset if possible.
2583 if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
2584 N0 = Addr.getOperand(0);
2585 N1 = Addr.getOperand(1);
2586 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2587 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2588 }
2589 if (!N0 || !N1)
2590 return false;
2591
2592 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2593 ImmOffset, ScaleOffset)) {
2594 SBase = N0;
2595 return true;
2596 }
2597 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2598 ImmOffset, ScaleOffset)) {
2599 SBase = N1;
2600 return true;
2601 }
2602 return false;
2603}
2604
2605bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2606 SDValue *SOffset, SDValue *Offset,
2607 bool Imm32Only, bool *ScaleOffset) const {
2608 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2609 /* IsBuffer */ false, /* HasSOffset */ false,
2610 /* ImmOffset */ 0, ScaleOffset)) {
2611 SBase = Expand32BitAddress(SBase);
2612 return true;
2613 }
2614
2615 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2616 SBase = Expand32BitAddress(Addr);
2617 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2618 return true;
2619 }
2620
2621 return false;
2622}
2623
2624bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2625 SDValue &Offset) const {
2626 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2627 &Offset);
2628}
2629
2630bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2631 SDValue &Offset) const {
2632 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2633 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2634 &Offset, /* Imm32Only */ true);
2635}
2636
2637bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2638 SDValue &SOffset, SDValue &CPol) const {
2639 bool ScaleOffset;
2640 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2641 /* Imm32Only */ false, &ScaleOffset))
2642 return false;
2643
2644 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2645 SDLoc(N), MVT::i32);
2646 return true;
2647}
2648
2649bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2650 SDValue &SBase, SDValue &SOffset,
2651 SDValue &Offset,
2652 SDValue &CPol) const {
2653 bool ScaleOffset;
2654 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2655 return false;
2656
2657 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2658 SDLoc(N), MVT::i32);
2659 return true;
2660}
2661
2662bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2663 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2664 /* Imm32Only */ false, /* IsBuffer */ true);
2665}
2666
2667bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2668 SDValue &Offset) const {
2669 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2670 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2671 /* Imm32Only */ true, /* IsBuffer */ true);
2672}
2673
2674bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2675 SDValue &Offset) const {
2676 // Match the (soffset + offset) pair as a 32-bit register base and
2677 // an immediate offset.
2678 return N.getValueType() == MVT::i32 &&
2679 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2680 /* SOffset*/ nullptr, &Offset,
2681 /* Imm32Only */ false, /* IsBuffer */ true);
2682}
2683
2684bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2685 SDValue &Base,
2686 SDValue &Offset) const {
2687 SDLoc DL(Index);
2688
2689 if (CurDAG->isBaseWithConstantOffset(Index)) {
2690 SDValue N0 = Index.getOperand(0);
2691 SDValue N1 = Index.getOperand(1);
2692 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2693
2694 // (add n0, c0)
2695 // Don't peel off the offset (c0) if doing so could possibly lead
2696 // the base (n0) to be negative.
2697 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2698 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2699 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2700 Base = N0;
2701 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2702 return true;
2703 }
2704 }
2705
2706 if (isa<ConstantSDNode>(Index))
2707 return false;
2708
2709 Base = Index;
2710 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2711 return true;
2712}
2713
2714SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2715 SDValue Val, uint32_t Offset,
2716 uint32_t Width) {
2717 if (Val->isDivergent()) {
2718 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2719 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2720 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2721
2722 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2723 }
2724 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2725 // Transformation function, pack the offset and width of a BFE into
2726 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2727 // source, bits [5:0] contain the offset and bits [22:16] the width.
2728 uint32_t PackedVal = Offset | (Width << 16);
2729 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2730
2731 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2732}
2733
2734void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2735 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2736 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2737 // Predicate: 0 < b <= c < 32
2738
2739 const SDValue &Shl = N->getOperand(0);
2740 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2741 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2742
2743 if (B && C) {
2744 uint32_t BVal = B->getZExtValue();
2745 uint32_t CVal = C->getZExtValue();
2746
2747 if (0 < BVal && BVal <= CVal && CVal < 32) {
2748 bool Signed = N->getOpcode() == ISD::SRA;
2749 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2750 32 - CVal));
2751 return;
2752 }
2753 }
2754 SelectCode(N);
2755}
2756
2757void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2758 switch (N->getOpcode()) {
2759 case ISD::AND:
2760 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2761 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2762 // Predicate: isMask(mask)
2763 const SDValue &Srl = N->getOperand(0);
2764 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2765 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2766
2767 if (Shift && Mask) {
2768 uint32_t ShiftVal = Shift->getZExtValue();
2769 uint32_t MaskVal = Mask->getZExtValue();
2770
2771 if (isMask_32(MaskVal)) {
2772 uint32_t WidthVal = llvm::popcount(MaskVal);
2773 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2774 WidthVal));
2775 return;
2776 }
2777 }
2778 }
2779 break;
2780 case ISD::SRL:
2781 if (N->getOperand(0).getOpcode() == ISD::AND) {
2782 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2783 // Predicate: isMask(mask >> b)
2784 const SDValue &And = N->getOperand(0);
2785 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2786 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2787
2788 if (Shift && Mask) {
2789 uint32_t ShiftVal = Shift->getZExtValue();
2790 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2791
2792 if (isMask_32(MaskVal)) {
2793 uint32_t WidthVal = llvm::popcount(MaskVal);
2794 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2795 WidthVal));
2796 return;
2797 }
2798 }
2799 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2800 SelectS_BFEFromShifts(N);
2801 return;
2802 }
2803 break;
2804 case ISD::SRA:
2805 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2806 SelectS_BFEFromShifts(N);
2807 return;
2808 }
2809 break;
2810
2812 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2813 SDValue Src = N->getOperand(0);
2814 if (Src.getOpcode() != ISD::SRL)
2815 break;
2816
2817 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2818 if (!Amt)
2819 break;
2820
2821 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2822 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2823 Amt->getZExtValue(), Width));
2824 return;
2825 }
2826 }
2827
2828 SelectCode(N);
2829}
2830
2831bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2832 assert(N->getOpcode() == ISD::BRCOND);
2833 if (!N->hasOneUse())
2834 return false;
2835
2836 SDValue Cond = N->getOperand(1);
2837 if (Cond.getOpcode() == ISD::CopyToReg)
2838 Cond = Cond.getOperand(2);
2839
2840 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2841 return false;
2842
2843 MVT VT = Cond.getOperand(0).getSimpleValueType();
2844 if (VT == MVT::i32)
2845 return true;
2846
2847 if (VT == MVT::i64) {
2848 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2849 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2850 Subtarget->hasScalarCompareEq64();
2851 }
2852
2853 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2854 return true;
2855
2856 return false;
2857}
2858
2859static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2860 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2861 // Special case for amdgcn.ballot:
2862 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2863 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2864 // =>
2865 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2866 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2867 // Cond becomes a i(WaveSize) full mask value.
2868 // Note that ballot doesn't use SETEQ condition but its easy to support it
2869 // here for completeness, so in this case Negate is set true on return.
2870 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2871 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2872 isNullConstant(VCMP.getOperand(1))) {
2873
2874 auto Cond = VCMP.getOperand(0);
2875 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2876 Cond = Cond.getOperand(0);
2877
2878 if (isBoolSGPR(Cond)) {
2879 Negate = VCMP_CC == ISD::SETEQ;
2880 return Cond;
2881 }
2882 }
2883 return SDValue();
2884}
2885
2886void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2887 SDValue Cond = N->getOperand(1);
2888
2889 if (Cond.isUndef()) {
2890 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2891 N->getOperand(2), N->getOperand(0));
2892 return;
2893 }
2894
2895 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2896
2897 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2898 bool AndExec = !UseSCCBr;
2899 bool Negate = false;
2900
2901 if (Cond.getOpcode() == ISD::SETCC &&
2902 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2903 SDValue VCMP = Cond->getOperand(0);
2904 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2905 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2906 isNullConstant(Cond->getOperand(1)) &&
2907 // We may encounter ballot.i64 in wave32 mode on -O0.
2908 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2909 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2910 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2911 // BRCOND i1 %C, %BB
2912 // =>
2913 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2914 // VCC = COPY i(WaveSize) %VCMP
2915 // S_CBRANCH_VCCNZ/VCCZ %BB
2916 Negate = CC == ISD::SETEQ;
2917 bool NegatedBallot = false;
2918 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2919 Cond = BallotCond;
2920 UseSCCBr = !BallotCond->isDivergent();
2921 Negate = Negate ^ NegatedBallot;
2922 } else {
2923 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2924 // selected as V_CMP, but this may change for uniform condition.
2925 Cond = VCMP;
2926 UseSCCBr = false;
2927 }
2928 }
2929 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2930 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2931 // used.
2932 AndExec = false;
2933 }
2934
2935 unsigned BrOp =
2936 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2937 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2938 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2939 SDLoc SL(N);
2940
2941 if (AndExec) {
2942 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2943 // analyzed what generates the vcc value, so we do not know whether vcc
2944 // bits for disabled lanes are 0. Thus we need to mask out bits for
2945 // disabled lanes.
2946 //
2947 // For the case that we select S_CBRANCH_SCC1 and it gets
2948 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2949 // SIInstrInfo::moveToVALU which inserts the S_AND).
2950 //
2951 // We could add an analysis of what generates the vcc value here and omit
2952 // the S_AND when is unnecessary. But it would be better to add a separate
2953 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2954 // catches both cases.
2955 Cond = SDValue(
2956 CurDAG->getMachineNode(
2957 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2958 MVT::i1,
2959 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2960 : AMDGPU::EXEC,
2961 MVT::i1),
2962 Cond),
2963 0);
2964 }
2965
2966 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2967 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2968 N->getOperand(2), // Basic Block
2969 VCC.getValue(0));
2970}
2971
2972void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2973 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2974 !N->isDivergent()) {
2975 SDValue Src = N->getOperand(0);
2976 if (Src.getValueType() == MVT::f16) {
2977 if (isExtractHiElt(Src, Src)) {
2978 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2979 {Src});
2980 return;
2981 }
2982 }
2983 }
2984
2985 SelectCode(N);
2986}
2987
2988void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2989 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2990 // be copied to an SGPR with readfirstlane.
2991 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2992 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2993
2994 SDValue Chain = N->getOperand(0);
2995 SDValue Ptr = N->getOperand(2);
2996 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2997 MachineMemOperand *MMO = M->getMemOperand();
2998 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2999
3001 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
3002 SDValue PtrBase = Ptr.getOperand(0);
3003 SDValue PtrOffset = Ptr.getOperand(1);
3004
3005 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
3006 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
3007 N = glueCopyToM0(N, PtrBase);
3008 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
3009 }
3010 }
3011
3012 if (!Offset) {
3013 N = glueCopyToM0(N, Ptr);
3014 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
3015 }
3016
3017 SDValue Ops[] = {
3018 Offset,
3019 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
3020 Chain,
3021 N->getOperand(N->getNumOperands() - 1) // New glue
3022 };
3023
3024 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3025 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3026}
3027
3028// We need to handle this here because tablegen doesn't support matching
3029// instructions with multiple outputs.
3030void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3031 unsigned Opc;
3032 switch (IntrID) {
3033 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3034 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3035 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3036 break;
3037 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3038 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3039 break;
3040 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3041 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3042 break;
3043 }
3044 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3045 N->getOperand(5), N->getOperand(0)};
3046
3047 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3048 MachineMemOperand *MMO = M->getMemOperand();
3049 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3050 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3051}
3052
3053void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3054 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3055 unsigned Opc =
3056 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3057
3058 SmallVector<SDValue, 7> TensorOps;
3059 // First two groups
3060 TensorOps.push_back(N->getOperand(2)); // D# group 0
3061 TensorOps.push_back(N->getOperand(3)); // D# group 1
3062
3063 // Use _D2 version if both group 2 and 3 are zero-initialized.
3064 SDValue Group2 = N->getOperand(4);
3065 SDValue Group3 = N->getOperand(5);
3066 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3068 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3069 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3070 } else { // Has at least 4 groups
3071 TensorOps.push_back(Group2); // D# group 2
3072 TensorOps.push_back(Group3); // D# group 3
3073 }
3074
3075 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3076 // for now because all existing targets only support up to 4 groups.
3077 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3078 TensorOps.push_back(N->getOperand(7)); // cache policy
3079 TensorOps.push_back(N->getOperand(0)); // chain
3080
3081 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3082}
3083
3084static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3085 switch (IntrID) {
3086 case Intrinsic::amdgcn_ds_gws_init:
3087 return AMDGPU::DS_GWS_INIT;
3088 case Intrinsic::amdgcn_ds_gws_barrier:
3089 return AMDGPU::DS_GWS_BARRIER;
3090 case Intrinsic::amdgcn_ds_gws_sema_v:
3091 return AMDGPU::DS_GWS_SEMA_V;
3092 case Intrinsic::amdgcn_ds_gws_sema_br:
3093 return AMDGPU::DS_GWS_SEMA_BR;
3094 case Intrinsic::amdgcn_ds_gws_sema_p:
3095 return AMDGPU::DS_GWS_SEMA_P;
3096 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3097 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3098 default:
3099 llvm_unreachable("not a gws intrinsic");
3100 }
3101}
3102
3103void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3104 if (!Subtarget->hasGWS() ||
3105 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3106 !Subtarget->hasGWSSemaReleaseAll())) {
3107 // Let this error.
3108 SelectCode(N);
3109 return;
3110 }
3111
3112 // Chain, intrinsic ID, vsrc, offset
3113 const bool HasVSrc = N->getNumOperands() == 4;
3114 assert(HasVSrc || N->getNumOperands() == 3);
3115
3116 SDLoc SL(N);
3117 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3118 int ImmOffset = 0;
3119 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3120 MachineMemOperand *MMO = M->getMemOperand();
3121
3122 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3123 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3124
3125 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3126 // offset field) % 64. Some versions of the programming guide omit the m0
3127 // part, or claim it's from offset 0.
3128 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3129 // If we have a constant offset, try to use the 0 in m0 as the base.
3130 // TODO: Look into changing the default m0 initialization value. If the
3131 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3132 // the immediate offset.
3133 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3134 ImmOffset = ConstOffset->getZExtValue();
3135 } else {
3136 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3137 ImmOffset = BaseOffset.getConstantOperandVal(1);
3138 BaseOffset = BaseOffset.getOperand(0);
3139 }
3140
3141 // Prefer to do the shift in an SGPR since it should be possible to use m0
3142 // as the result directly. If it's already an SGPR, it will be eliminated
3143 // later.
3144 SDNode *SGPROffset
3145 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3146 BaseOffset);
3147 // Shift to offset in m0
3148 SDNode *M0Base
3149 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3150 SDValue(SGPROffset, 0),
3151 CurDAG->getTargetConstant(16, SL, MVT::i32));
3152 glueCopyToM0(N, SDValue(M0Base, 0));
3153 }
3154
3155 SDValue Chain = N->getOperand(0);
3156 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3157
3158 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3159
3160 const MCInstrDesc &InstrDesc = TII->get(Opc);
3161 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3162
3163 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3164
3166 if (HasVSrc) {
3167 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3168
3169 SDValue Data = N->getOperand(2);
3170 MVT DataVT = Data.getValueType().getSimpleVT();
3171 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3172 // Normal 32-bit case.
3173 Ops.push_back(N->getOperand(2));
3174 } else {
3175 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3176 // even aligned 64-bit register class.
3177 const SDValue RegSeqOps[] = {
3178 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3179 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3180 SDValue(
3181 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3182 0),
3183 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3184
3185 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3186 SL, MVT::v2i32, RegSeqOps),
3187 0));
3188 }
3189 }
3190
3191 Ops.push_back(OffsetField);
3192 Ops.push_back(Chain);
3193
3194 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3195 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3196}
3197
3198void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3199 if (Subtarget->getLDSBankCount() != 16) {
3200 // This is a single instruction with a pattern.
3201 SelectCode(N);
3202 return;
3203 }
3204
3205 SDLoc DL(N);
3206
3207 // This requires 2 instructions. It is possible to write a pattern to support
3208 // this, but the generated isel emitter doesn't correctly deal with multiple
3209 // output instructions using the same physical register input. The copy to m0
3210 // is incorrectly placed before the second instruction.
3211 //
3212 // TODO: Match source modifiers.
3213 //
3214 // def : Pat <
3215 // (int_amdgcn_interp_p1_f16
3216 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3217 // (i32 timm:$attrchan), (i32 timm:$attr),
3218 // (i1 timm:$high), M0),
3219 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3220 // timm:$attrchan, 0,
3221 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3222 // let Predicates = [has16BankLDS];
3223 // }
3224
3225 // 16 bank LDS
3226 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3227 N->getOperand(5), SDValue());
3228
3229 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3230
3231 SDNode *InterpMov =
3232 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3233 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3234 N->getOperand(3), // Attr
3235 N->getOperand(2), // Attrchan
3236 ToM0.getValue(1) // In glue
3237 });
3238
3239 SDNode *InterpP1LV =
3240 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3241 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3242 N->getOperand(1), // Src0
3243 N->getOperand(3), // Attr
3244 N->getOperand(2), // Attrchan
3245 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3246 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3247 N->getOperand(4), // high
3248 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3249 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3250 SDValue(InterpMov, 1)
3251 });
3252
3253 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3254}
3255
3256void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3257 unsigned IntrID = N->getConstantOperandVal(1);
3258 switch (IntrID) {
3259 case Intrinsic::amdgcn_ds_append:
3260 case Intrinsic::amdgcn_ds_consume: {
3261 if (N->getValueType(0) != MVT::i32)
3262 break;
3263 SelectDSAppendConsume(N, IntrID);
3264 return;
3265 }
3266 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3267 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3268 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3269 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3270 SelectDSBvhStackIntrinsic(N, IntrID);
3271 return;
3272 case Intrinsic::amdgcn_init_whole_wave:
3273 CurDAG->getMachineFunction()
3274 .getInfo<SIMachineFunctionInfo>()
3275 ->setInitWholeWave();
3276 break;
3277 }
3278
3279 SelectCode(N);
3280}
3281
3282void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3283 unsigned IntrID = N->getConstantOperandVal(0);
3284 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3285 SDNode *ConvGlueNode = N->getGluedNode();
3286 if (ConvGlueNode) {
3287 // FIXME: Possibly iterate over multiple glue nodes?
3288 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3289 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3290 ConvGlueNode =
3291 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3292 MVT::Glue, SDValue(ConvGlueNode, 0));
3293 } else {
3294 ConvGlueNode = nullptr;
3295 }
3296 switch (IntrID) {
3297 case Intrinsic::amdgcn_wqm:
3298 Opcode = AMDGPU::WQM;
3299 break;
3300 case Intrinsic::amdgcn_softwqm:
3301 Opcode = AMDGPU::SOFT_WQM;
3302 break;
3303 case Intrinsic::amdgcn_wwm:
3304 case Intrinsic::amdgcn_strict_wwm:
3305 Opcode = AMDGPU::STRICT_WWM;
3306 break;
3307 case Intrinsic::amdgcn_strict_wqm:
3308 Opcode = AMDGPU::STRICT_WQM;
3309 break;
3310 case Intrinsic::amdgcn_interp_p1_f16:
3311 SelectInterpP1F16(N);
3312 return;
3313 case Intrinsic::amdgcn_permlane16_swap:
3314 case Intrinsic::amdgcn_permlane32_swap: {
3315 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3316 !Subtarget->hasPermlane16Swap()) ||
3317 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3318 !Subtarget->hasPermlane32Swap())) {
3319 SelectCode(N); // Hit the default error
3320 return;
3321 }
3322
3323 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3324 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3325 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3326
3327 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3328 if (ConvGlueNode)
3329 NewOps.push_back(SDValue(ConvGlueNode, 0));
3330
3331 bool FI = N->getConstantOperandVal(3);
3332 NewOps[2] = CurDAG->getTargetConstant(
3333 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3334
3335 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3336 return;
3337 }
3338 default:
3339 SelectCode(N);
3340 break;
3341 }
3342
3343 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3344 SDValue Src = N->getOperand(1);
3345 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3346 }
3347
3348 if (ConvGlueNode) {
3349 SmallVector<SDValue, 4> NewOps(N->ops());
3350 NewOps.push_back(SDValue(ConvGlueNode, 0));
3351 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3352 }
3353}
3354
3355void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3356 unsigned IntrID = N->getConstantOperandVal(1);
3357 switch (IntrID) {
3358 case Intrinsic::amdgcn_ds_gws_init:
3359 case Intrinsic::amdgcn_ds_gws_barrier:
3360 case Intrinsic::amdgcn_ds_gws_sema_v:
3361 case Intrinsic::amdgcn_ds_gws_sema_br:
3362 case Intrinsic::amdgcn_ds_gws_sema_p:
3363 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3364 SelectDS_GWS(N, IntrID);
3365 return;
3366 case Intrinsic::amdgcn_tensor_load_to_lds:
3367 case Intrinsic::amdgcn_tensor_store_from_lds:
3368 SelectTensorLoadStore(N, IntrID);
3369 return;
3370 default:
3371 break;
3372 }
3373
3374 SelectCode(N);
3375}
3376
3377void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3378 SDValue Log2WaveSize =
3379 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3380 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3381 {N->getOperand(0), Log2WaveSize});
3382}
3383
3384void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3385 SDValue SrcVal = N->getOperand(1);
3386 if (SrcVal.getValueType() != MVT::i32) {
3387 SelectCode(N); // Emit default error
3388 return;
3389 }
3390
3391 SDValue CopyVal;
3392 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3393 SDLoc SL(N);
3394
3395 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3396 CopyVal = SrcVal.getOperand(0);
3397 } else {
3398 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3399 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3400
3401 if (N->isDivergent()) {
3402 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3403 MVT::i32, SrcVal),
3404 0);
3405 }
3406
3407 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3408 {SrcVal, Log2WaveSize}),
3409 0);
3410 }
3411
3412 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3413 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3414}
3415
3416bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3417 unsigned &Mods,
3418 bool IsCanonicalizing,
3419 bool AllowAbs) const {
3420 Mods = SISrcMods::NONE;
3421 Src = In;
3422
3423 if (Src.getOpcode() == ISD::FNEG) {
3424 Mods |= SISrcMods::NEG;
3425 Src = Src.getOperand(0);
3426 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3427 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3428 // denormal mode, but we're implicitly canonicalizing in a source operand.
3429 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3430 if (LHS && LHS->isZero()) {
3431 Mods |= SISrcMods::NEG;
3432 Src = Src.getOperand(1);
3433 }
3434 }
3435
3436 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3437 Mods |= SISrcMods::ABS;
3438 Src = Src.getOperand(0);
3439 }
3440
3441 if (Mods != SISrcMods::NONE)
3442 return true;
3443
3444 // Convert various sign-bit masks on integers to src mods. Currently disabled
3445 // for 16-bit types as the codegen replaces the operand without adding a
3446 // srcmod. This is intentionally finding the cases where we are performing
3447 // float neg and abs on int types, the goal is not to obtain two's complement
3448 // neg or abs. Limit converison to select operands via the nonCanonalizing
3449 // pattern.
3450 // TODO: Add 16-bit support.
3451 if (IsCanonicalizing)
3452 return true;
3453
3454 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3455 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3456 // through the extract to the bitwise op.
3457 SDValue PeekSrc =
3458 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3459 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3460 // types as the codegen replaces the operand without adding a srcmod.
3461 // This is intentionally finding the cases where we are performing float neg
3462 // and abs on int types, the goal is not to obtain two's complement neg or
3463 // abs.
3464 // TODO: Add 16-bit support.
3465 unsigned Opc = PeekSrc.getOpcode();
3466 EVT VT = Src.getValueType();
3467 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3468 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3469 return true;
3470
3471 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3472 if (!CRHS)
3473 return true;
3474
3475 auto ReplaceSrc = [&]() -> SDValue {
3476 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3477 return Src.getOperand(0);
3478
3479 SDValue LHS = PeekSrc->getOperand(0);
3480 SDValue Index = Src->getOperand(1);
3481 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3482 Src.getValueType(), LHS, Index);
3483 };
3484
3485 // Recognise Srcmods:
3486 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3487 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3488 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3489 // SrcModifiers.
3490 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3491 Mods |= SISrcMods::NEG;
3492 Src = ReplaceSrc();
3493 } else if (Opc == ISD::AND && AllowAbs &&
3494 CRHS->getAPIntValue().isMaxSignedValue()) {
3495 Mods |= SISrcMods::ABS;
3496 Src = ReplaceSrc();
3497 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3499 Src = ReplaceSrc();
3500 }
3501
3502 return true;
3503}
3504
3505bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3506 SDValue &SrcMods) const {
3507 unsigned Mods;
3508 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3509 /*AllowAbs=*/true)) {
3510 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3511 return true;
3512 }
3513
3514 return false;
3515}
3516
3517bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3518 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3519 unsigned Mods;
3520 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3521 /*AllowAbs=*/true)) {
3522 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3523 return true;
3524 }
3525
3526 return false;
3527}
3528
3529bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3530 SDValue &SrcMods) const {
3531 unsigned Mods;
3532 if (SelectVOP3ModsImpl(In, Src, Mods,
3533 /*IsCanonicalizing=*/true,
3534 /*AllowAbs=*/false)) {
3535 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3536 return true;
3537 }
3538
3539 return false;
3540}
3541
3542bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3543 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3544 return false;
3545
3546 Src = In;
3547 return true;
3548}
3549
3550bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3551 SDValue &SrcMods,
3552 bool OpSel) const {
3553 unsigned Mods;
3554 if (SelectVOP3ModsImpl(In, Src, Mods,
3555 /*IsCanonicalizing=*/true,
3556 /*AllowAbs=*/false)) {
3557 if (OpSel)
3558 Mods |= SISrcMods::OP_SEL_0;
3559 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3560 return true;
3561 }
3562
3563 return false;
3564}
3565
3566bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3567 SDValue &SrcMods) const {
3568 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3569}
3570
3571bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3572 SDValue &SrcMods) const {
3573 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3574}
3575
3576bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3577 SDValue &SrcMods, SDValue &Clamp,
3578 SDValue &Omod) const {
3579 SDLoc DL(In);
3580 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3581 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3582
3583 return SelectVOP3Mods(In, Src, SrcMods);
3584}
3585
3586bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3587 SDValue &SrcMods, SDValue &Clamp,
3588 SDValue &Omod) const {
3589 SDLoc DL(In);
3590 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3591 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3592
3593 return SelectVOP3BMods(In, Src, SrcMods);
3594}
3595
3596bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3597 SDValue &Clamp, SDValue &Omod) const {
3598 Src = In;
3599
3600 SDLoc DL(In);
3601 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3602 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3603
3604 return true;
3605}
3606
3607bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3608 SDValue &SrcMods, bool IsDOT) const {
3609 unsigned Mods = SISrcMods::NONE;
3610 Src = In;
3611
3612 // TODO: Handle G_FSUB 0 as fneg
3613 if (Src.getOpcode() == ISD::FNEG) {
3615 Src = Src.getOperand(0);
3616 }
3617
3618 // 64-bit VOP3P instructions do not have OPSEL or ABS. Bail on v2f64 or v2i64.
3619 // TODO: Select NEG_LO and NEG_HI modifiers from BUILD_VECTOR.
3620 if (Src.getValueSizeInBits() == 128) {
3621 Mods |= SISrcMods::OP_SEL_1; // Just the default, OPSEL unsupported.
3622 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3623 return true;
3624 }
3625
3626 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3627 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3628 unsigned VecMods = Mods;
3629
3630 SDValue Lo = stripBitcast(Src.getOperand(0));
3631 SDValue Hi = stripBitcast(Src.getOperand(1));
3632
3633 if (Lo.getOpcode() == ISD::FNEG) {
3634 Lo = stripBitcast(Lo.getOperand(0));
3635 Mods ^= SISrcMods::NEG;
3636 }
3637
3638 if (Hi.getOpcode() == ISD::FNEG) {
3639 Hi = stripBitcast(Hi.getOperand(0));
3640 Mods ^= SISrcMods::NEG_HI;
3641 }
3642
3643 if (isExtractHiElt(Lo, Lo))
3644 Mods |= SISrcMods::OP_SEL_0;
3645
3646 if (isExtractHiElt(Hi, Hi))
3647 Mods |= SISrcMods::OP_SEL_1;
3648
3649 unsigned VecSize = Src.getValueSizeInBits();
3650 Lo = stripExtractLoElt(Lo);
3651 Hi = stripExtractLoElt(Hi);
3652
3653 if (Lo.getValueSizeInBits() > VecSize) {
3654 Lo = CurDAG->getTargetExtractSubreg(
3655 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3656 MVT::getIntegerVT(VecSize), Lo);
3657 }
3658
3659 if (Hi.getValueSizeInBits() > VecSize) {
3660 Hi = CurDAG->getTargetExtractSubreg(
3661 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3662 MVT::getIntegerVT(VecSize), Hi);
3663 }
3664
3665 assert(Lo.getValueSizeInBits() <= VecSize &&
3666 Hi.getValueSizeInBits() <= VecSize);
3667
3668 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3669 // Really a scalar input. Just select from the low half of the register to
3670 // avoid packing.
3671
3672 if (VecSize == Lo.getValueSizeInBits()) {
3673 Src = Lo;
3674 } else if (VecSize == 32) {
3675 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3676 } else {
3677 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3678
3679 SDLoc SL(In);
3681 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3682 Lo.getValueType()), 0);
3683 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3684 : AMDGPU::SReg_64RegClassID;
3685 const SDValue Ops[] = {
3686 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3687 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3688 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3689
3690 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3691 Src.getValueType(), Ops), 0);
3692 }
3693 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3694 return true;
3695 }
3696
3697 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3698 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3699 .bitcastToAPInt().getZExtValue();
3700 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3701 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3702 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3703 return true;
3704 }
3705 }
3706
3707 Mods = VecMods;
3708 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3709 Src.getNumOperands() == 2) {
3710
3711 // TODO: We should repeat the build_vector source check above for the
3712 // vector_shuffle for negates and casts of individual elements.
3713
3714 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3715 ArrayRef<int> Mask = SVN->getMask();
3716
3717 if (Mask[0] < 2 && Mask[1] < 2) {
3718 // src1 should be undef.
3719 SDValue ShuffleSrc = SVN->getOperand(0);
3720
3721 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3722 ShuffleSrc = ShuffleSrc.getOperand(0);
3724 }
3725
3726 if (Mask[0] == 1)
3727 Mods |= SISrcMods::OP_SEL_0;
3728 if (Mask[1] == 1)
3729 Mods |= SISrcMods::OP_SEL_1;
3730
3731 Src = ShuffleSrc;
3732 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3733 return true;
3734 }
3735 }
3736
3737 // Packed instructions do not have abs modifiers.
3738 Mods |= SISrcMods::OP_SEL_1;
3739
3740 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3741 return true;
3742}
3743
3744bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3745 SDValue &SrcMods) const {
3746 return SelectVOP3PMods(In, Src, SrcMods, true);
3747}
3748
3749bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3750 SDValue SrcTmp, SrcModsTmp;
3751 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3752 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3753 Src = SrcTmp;
3754 return true;
3755 }
3756
3757 return false;
3758}
3759
3760bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3761 SDValue &SrcMods) const {
3762 SelectVOP3Mods(In, Src, SrcMods);
3763 unsigned Mods = SISrcMods::OP_SEL_1;
3764 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3765 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3766 return true;
3767}
3768
3769bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3770 SDValue SrcTmp, SrcModsTmp;
3771 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3772 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3773 Src = SrcTmp;
3774 return true;
3775 }
3776
3777 return false;
3778}
3779
3780bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3781 SDValue &Src) const {
3782 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3783 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3784
3785 unsigned Mods = SISrcMods::OP_SEL_1;
3786 unsigned SrcVal = C->getZExtValue();
3787 if (SrcVal == 1)
3788 Mods |= SISrcMods::OP_SEL_0;
3789
3790 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3791 return true;
3792}
3793
3795AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3796 const SDLoc &DL) const {
3797 unsigned DstRegClass;
3798 EVT DstTy;
3799 switch (Elts.size()) {
3800 case 8:
3801 DstRegClass = AMDGPU::VReg_256RegClassID;
3802 DstTy = MVT::v8i32;
3803 break;
3804 case 4:
3805 DstRegClass = AMDGPU::VReg_128RegClassID;
3806 DstTy = MVT::v4i32;
3807 break;
3808 case 2:
3809 DstRegClass = AMDGPU::VReg_64RegClassID;
3810 DstTy = MVT::v2i32;
3811 break;
3812 default:
3813 llvm_unreachable("unhandled Reg sequence size");
3814 }
3815
3817 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3818 for (unsigned i = 0; i < Elts.size(); ++i) {
3819 Ops.push_back(Elts[i]);
3820 Ops.push_back(CurDAG->getTargetConstant(
3822 }
3823 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3824}
3825
3827AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3828 const SDLoc &DL) const {
3829 SmallVector<SDValue, 8> PackedElts;
3830 assert("unhandled Reg sequence size" &&
3831 (Elts.size() == 8 || Elts.size() == 16));
3832
3833 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3834 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3835 for (unsigned i = 0; i < Elts.size(); i += 2) {
3836 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3837 SDValue HiSrc;
3838 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3839 PackedElts.push_back(HiSrc);
3840 } else {
3841 if (Subtarget->useRealTrue16Insts()) {
3842 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3843 // passing to v_perm_b32. Eventually we should use replace v_perm_b32
3844 // by reg_sequence.
3846 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3847 0);
3848 Elts[i] =
3849 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3850 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3851 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3852 MVT::i32, {Elts[i + 1], Undef},
3853 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3854 }
3855 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3856 MachineSDNode *Packed =
3857 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3858 {Elts[i + 1], Elts[i], PackLoLo});
3859 PackedElts.push_back(SDValue(Packed, 0));
3860 }
3861 }
3862 return buildRegSequence32(PackedElts, DL);
3863}
3864
3866AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3867 const SDLoc &DL,
3868 unsigned ElementSize) const {
3869 if (ElementSize == 16)
3870 return buildRegSequence16(Elts, DL);
3871 if (ElementSize == 32)
3872 return buildRegSequence32(Elts, DL);
3873 llvm_unreachable("Unhandled element size");
3874}
3875
3876void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3877 unsigned &Mods,
3879 SDValue &Src, const SDLoc &DL,
3880 unsigned ElementSize) const {
3881 if (ModOpcode == ISD::FNEG) {
3882 Mods |= SISrcMods::NEG;
3883 // Check if all elements also have abs modifier
3884 SmallVector<SDValue, 8> NegAbsElts;
3885 for (auto El : Elts) {
3886 if (El.getOpcode() != ISD::FABS)
3887 break;
3888 NegAbsElts.push_back(El->getOperand(0));
3889 }
3890 if (Elts.size() != NegAbsElts.size()) {
3891 // Neg
3892 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3893 } else {
3894 // Neg and Abs
3895 Mods |= SISrcMods::NEG_HI;
3896 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3897 }
3898 } else {
3899 assert(ModOpcode == ISD::FABS);
3900 // Abs
3901 Mods |= SISrcMods::NEG_HI;
3902 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3903 }
3904}
3905
3906// Check all f16 elements for modifiers while looking through b32 and v2b16
3907// build vector, stop if element does not satisfy ModifierCheck.
3908static void
3910 std::function<bool(SDValue)> ModifierCheck) {
3911 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3912 if (auto *F16Pair =
3913 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3914 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3915 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3916 if (!ModifierCheck(ElF16))
3917 break;
3918 }
3919 }
3920 }
3921}
3922
3923bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3924 SDValue &SrcMods) const {
3925 Src = In;
3926 unsigned Mods = SISrcMods::OP_SEL_1;
3927
3928 // mods are on f16 elements
3929 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3931
3932 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3933 if (Element.getOpcode() != ISD::FNEG)
3934 return false;
3935 EltsF16.push_back(Element.getOperand(0));
3936 return true;
3937 });
3938
3939 // All elements have neg modifier
3940 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3941 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
3942 Mods |= SISrcMods::NEG;
3943 Mods |= SISrcMods::NEG_HI;
3944 }
3945 }
3946
3947 // mods are on v2f16 elements
3948 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3949 SmallVector<SDValue, 8> EltsV2F16;
3950 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3951 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3952 // Based on first element decide which mod we match, neg or abs
3953 if (ElV2f16.getOpcode() != ISD::FNEG)
3954 break;
3955 EltsV2F16.push_back(ElV2f16.getOperand(0));
3956 }
3957
3958 // All pairs of elements have neg modifier
3959 if (BV->getNumOperands() == EltsV2F16.size()) {
3960 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
3961 Mods |= SISrcMods::NEG;
3962 Mods |= SISrcMods::NEG_HI;
3963 }
3964 }
3965
3966 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3967 return true;
3968}
3969
3970bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3971 SDValue &SrcMods) const {
3972 Src = In;
3973 unsigned Mods = SISrcMods::OP_SEL_1;
3974 unsigned ModOpcode;
3975
3976 // mods are on f16 elements
3977 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3979 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3980 // Based on first element decide which mod we match, neg or abs
3981 if (EltsF16.empty())
3982 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3983 if (ElF16.getOpcode() != ModOpcode)
3984 return false;
3985 EltsF16.push_back(ElF16.getOperand(0));
3986 return true;
3987 });
3988
3989 // All elements have ModOpcode modifier
3990 if (BV->getNumOperands() * 2 == EltsF16.size())
3991 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
3992 }
3993
3994 // mods are on v2f16 elements
3995 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3996 SmallVector<SDValue, 8> EltsV2F16;
3997
3998 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3999 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
4000 // Based on first element decide which mod we match, neg or abs
4001 if (EltsV2F16.empty())
4002 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4003 if (ElV2f16->getOpcode() != ModOpcode)
4004 break;
4005 EltsV2F16.push_back(ElV2f16->getOperand(0));
4006 }
4007
4008 // All elements have ModOpcode modifier
4009 if (BV->getNumOperands() == EltsV2F16.size())
4010 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
4011 }
4012
4013 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4014 return true;
4015}
4016
4017bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
4018 SDValue &SrcMods) const {
4019 Src = In;
4020 unsigned Mods = SISrcMods::OP_SEL_1;
4022
4023 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4024 assert(BV->getNumOperands() > 0);
4025 // Based on first element decide which mod we match, neg or abs
4026 SDValue ElF32 = stripBitcast(BV->getOperand(0));
4027 unsigned ModOpcode =
4028 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4029 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4030 SDValue ElF32 = stripBitcast(BV->getOperand(i));
4031 if (ElF32.getOpcode() != ModOpcode)
4032 break;
4033 EltsF32.push_back(ElF32.getOperand(0));
4034 }
4035
4036 // All elements had ModOpcode modifier
4037 if (BV->getNumOperands() == EltsF32.size())
4038 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4039 }
4040
4041 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4042 return true;
4043}
4044
4045bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4046 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4047 BitVector UndefElements;
4048 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4049 if (isInlineImmediate(Splat.getNode())) {
4050 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4051 unsigned Imm = C->getAPIntValue().getSExtValue();
4052 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4053 return true;
4054 }
4055 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4056 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4057 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4058 return true;
4059 }
4060 llvm_unreachable("unhandled Constant node");
4061 }
4062 }
4063
4064 // 16 bit splat
4065 SDValue SplatSrc32 = stripBitcast(In);
4066 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4067 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4068 SDValue SplatSrc16 = stripBitcast(Splat32);
4069 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4070 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4071 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4072 std::optional<APInt> RawValue;
4073 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4074 RawValue = C->getValueAPF().bitcastToAPInt();
4075 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4076 RawValue = C->getAPIntValue();
4077
4078 if (RawValue.has_value()) {
4079 EVT VT = In.getValueType().getScalarType();
4080 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
4081 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4084 RawValue.value());
4085 if (TII->isInlineConstant(FloatVal)) {
4086 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4087 MVT::i16);
4088 return true;
4089 }
4090 } else if (VT.getSimpleVT() == MVT::i16) {
4091 if (TII->isInlineConstant(RawValue.value())) {
4092 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4093 MVT::i16);
4094 return true;
4095 }
4096 } else
4097 llvm_unreachable("unknown 16-bit type");
4098 }
4099 }
4100 }
4101
4102 // Currently f64 immediate vectors are represented as vectors of v2i32, with
4103 // different lo and hi 32-bit values even though double values are splated.
4104 // So we have to manually compare to determine whether it is splated.
4105 if (CurDAG->isConstantIntBuildVectorOrConstantInt(SplatSrc32)) {
4106 int64_t Imm64 = 0;
4107 for (unsigned i = 0; i < SplatSrc32->getNumOperands(); i += 2) {
4108 auto Lo32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i));
4109 auto Hi32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i + 1));
4110 int64_t LoImm = Lo32->getAPIntValue().getSExtValue();
4111 int64_t HiImm = Hi32->getAPIntValue().getSExtValue();
4112 int64_t Imm64I = (HiImm << 32) + LoImm;
4113 if (i == 0) {
4114 if (!isInlineImmediate(APInt(64, Imm64I)))
4115 return false;
4116 Imm64 = Imm64I;
4117 } else if (Imm64I != Imm64)
4118 return false;
4119 } // end for
4120
4121 Src = CurDAG->getTargetConstant(Imm64, SDLoc(In), MVT::i64);
4122 return true;
4123 }
4124
4125 return false;
4126}
4127
4128bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4129 SDValue &IndexKey) const {
4130 unsigned Key = 0;
4131 Src = In;
4132
4133 if (In.getOpcode() == ISD::SRL) {
4134 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4135 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4136 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4137 ShiftAmt->getZExtValue() % 8 == 0) {
4138 Key = ShiftAmt->getZExtValue() / 8;
4139 Src = ShiftSrc;
4140 }
4141 }
4142
4143 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4144 return true;
4145}
4146
4147bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4148 SDValue &IndexKey) const {
4149 unsigned Key = 0;
4150 Src = In;
4151
4152 if (In.getOpcode() == ISD::SRL) {
4153 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4154 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4155 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4156 ShiftAmt->getZExtValue() == 16) {
4157 Key = 1;
4158 Src = ShiftSrc;
4159 }
4160 }
4161
4162 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4163 return true;
4164}
4165
4166bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4167 SDValue &IndexKey) const {
4168 unsigned Key = 0;
4169 Src = In;
4170
4171 SDValue InI32;
4172
4173 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4174 const SDValue &ExtendSrc = In.getOperand(0);
4175 if (ExtendSrc.getValueSizeInBits() == 32)
4176 InI32 = ExtendSrc;
4177 } else if (In->getOpcode() == ISD::BITCAST) {
4178 const SDValue &CastSrc = In.getOperand(0);
4179 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4180 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4181 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4182 if (Zero && Zero->getZExtValue() == 0)
4183 InI32 = CastSrc.getOperand(0);
4184 }
4185 }
4186
4187 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4188 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4189 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4190 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4191 EltIdx->getZExtValue() == 1) {
4192 Key = 1;
4193 Src = ExtractVecEltSrc;
4194 }
4195 }
4196
4197 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4198 return true;
4199}
4200
4201bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4202 SDValue &SrcMods) const {
4203 Src = In;
4204 // FIXME: Handle op_sel
4205 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4206 return true;
4207}
4208
4209bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4210 SDValue &SrcMods) const {
4211 // FIXME: Handle op_sel
4212 return SelectVOP3Mods(In, Src, SrcMods);
4213}
4214
4215// Match lowered fpext from bf16 to f32. This is a bit operation extending
4216// a 16-bit value with 16-bit of zeroes at LSB:
4217//
4218// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4219// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4220// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4221static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4222 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4223 return SDValue();
4224 Op = Op.getOperand(0);
4225
4226 IsExtractHigh = false;
4227 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4228 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4229 if (!Low16 || !Low16->isZero())
4230 return SDValue();
4231 Op = stripBitcast(Op.getOperand(1));
4232 if (Op.getValueType() != MVT::bf16)
4233 return SDValue();
4234 return Op;
4235 }
4236
4237 if (Op.getValueType() != MVT::i32)
4238 return SDValue();
4239
4240 if (Op.getOpcode() == ISD::AND) {
4241 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4242 if (Mask->getZExtValue() == 0xffff0000) {
4243 IsExtractHigh = true;
4244 return Op.getOperand(0);
4245 }
4246 }
4247 return SDValue();
4248 }
4249
4250 if (Op.getOpcode() == ISD::SHL) {
4251 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4252 if (Amt->getZExtValue() == 16)
4253 return Op.getOperand(0);
4254 }
4255 }
4256
4257 return SDValue();
4258}
4259
4260// The return value is not whether the match is possible (which it always is),
4261// but whether or not it a conversion is really used.
4262bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4263 unsigned &Mods,
4264 MVT VT) const {
4265 Mods = 0;
4266 SelectVOP3ModsImpl(In, Src, Mods);
4267
4268 bool IsExtractHigh = false;
4269 if (Src.getOpcode() == ISD::FP_EXTEND) {
4270 Src = Src.getOperand(0);
4271 } else if (VT == MVT::bf16) {
4272 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4273 if (!B16)
4274 return false;
4275 Src = B16;
4276 } else
4277 return false;
4278
4279 if (Src.getValueType() != VT &&
4280 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4281 return false;
4282
4283 Src = stripBitcast(Src);
4284
4285 // Be careful about folding modifiers if we already have an abs. fneg is
4286 // applied last, so we don't want to apply an earlier fneg.
4287 if ((Mods & SISrcMods::ABS) == 0) {
4288 unsigned ModsTmp;
4289 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4290
4291 if ((ModsTmp & SISrcMods::NEG) != 0)
4292 Mods ^= SISrcMods::NEG;
4293
4294 if ((ModsTmp & SISrcMods::ABS) != 0)
4295 Mods |= SISrcMods::ABS;
4296 }
4297
4298 // op_sel/op_sel_hi decide the source type and source.
4299 // If the source's op_sel_hi is set, it indicates to do a conversion from
4300 // fp16. If the sources's op_sel is set, it picks the high half of the source
4301 // register.
4302
4303 Mods |= SISrcMods::OP_SEL_1;
4304 if (Src.getValueSizeInBits() == 16) {
4305 if (isExtractHiElt(Src, Src)) {
4306 Mods |= SISrcMods::OP_SEL_0;
4307
4308 // TODO: Should we try to look for neg/abs here?
4309 return true;
4310 }
4311
4312 if (Src.getOpcode() == ISD::TRUNCATE &&
4313 Src.getOperand(0).getValueType() == MVT::i32) {
4314 Src = Src.getOperand(0);
4315 return true;
4316 }
4317
4318 if (Subtarget->useRealTrue16Insts())
4319 // In true16 mode, pack src to a 32bit
4320 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4321 } else if (IsExtractHigh)
4322 Mods |= SISrcMods::OP_SEL_0;
4323
4324 return true;
4325}
4326
4327bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4328 SDValue &SrcMods) const {
4329 unsigned Mods = 0;
4330 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4331 return false;
4332 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4333 return true;
4334}
4335
4336bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4337 SDValue &SrcMods) const {
4338 unsigned Mods = 0;
4339 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4340 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4341 return true;
4342}
4343
4344bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4345 SDValue &SrcMods) const {
4346 unsigned Mods = 0;
4347 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4348 return false;
4349 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4350 return true;
4351}
4352
4353bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4354 SDValue &SrcMods) const {
4355 unsigned Mods = 0;
4356 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4357 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4358 return true;
4359}
4360
4361// Match BITOP3 operation and return a number of matched instructions plus
4362// truth table.
4363static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4365 unsigned NumOpcodes = 0;
4366 uint8_t LHSBits, RHSBits;
4367
4368 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4369 // Define truth table given Src0, Src1, Src2 bits permutations:
4370 // 0 0 0
4371 // 0 0 1
4372 // 0 1 0
4373 // 0 1 1
4374 // 1 0 0
4375 // 1 0 1
4376 // 1 1 0
4377 // 1 1 1
4378 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4379
4380 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4381 if (C->isAllOnes()) {
4382 Bits = 0xff;
4383 return true;
4384 }
4385 if (C->isZero()) {
4386 Bits = 0;
4387 return true;
4388 }
4389 }
4390
4391 for (unsigned I = 0; I < Src.size(); ++I) {
4392 // Try to find existing reused operand
4393 if (Src[I] == Op) {
4394 Bits = SrcBits[I];
4395 return true;
4396 }
4397 // Try to replace parent operator
4398 if (Src[I] == In) {
4399 Bits = SrcBits[I];
4400 Src[I] = Op;
4401 return true;
4402 }
4403 }
4404
4405 if (Src.size() == 3) {
4406 // No room left for operands. Try one last time, there can be a 'not' of
4407 // one of our source operands. In this case we can compute the bits
4408 // without growing Src vector.
4409 if (Op.getOpcode() == ISD::XOR) {
4410 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4411 if (C->isAllOnes()) {
4412 SDValue LHS = Op.getOperand(0);
4413 for (unsigned I = 0; I < Src.size(); ++I) {
4414 if (Src[I] == LHS) {
4415 Bits = ~SrcBits[I];
4416 return true;
4417 }
4418 }
4419 }
4420 }
4421 }
4422
4423 return false;
4424 }
4425
4426 Bits = SrcBits[Src.size()];
4427 Src.push_back(Op);
4428 return true;
4429 };
4430
4431 switch (In.getOpcode()) {
4432 case ISD::AND:
4433 case ISD::OR:
4434 case ISD::XOR: {
4435 SDValue LHS = In.getOperand(0);
4436 SDValue RHS = In.getOperand(1);
4437
4438 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4439 if (!getOperandBits(LHS, LHSBits) ||
4440 !getOperandBits(RHS, RHSBits)) {
4441 Src = std::move(Backup);
4442 return std::make_pair(0, 0);
4443 }
4444
4445 // Recursion is naturally limited by the size of the operand vector.
4446 //
4447 // When LHS and RHS share a common sub-expression, one side's recursion
4448 // may decompose that sub-expression and replace the Src slot the other
4449 // side occupies with sub-operands via the "replace parent" path in
4450 // getOperandBits. The other side's cached bit-pattern then refers to a
4451 // slot whose contents changed, producing a wrong truth table.
4452 //
4453 // We detect this in three ways:
4454 // (A) If LHS recursed, its truth table is valid against the Src state
4455 // when LHS recursion completed (SrcAfterLHS). If RHS recursion
4456 // then mutates a Src slot that LHSBits depends on, LHSBits is
4457 // stale.
4458 // (B) If RHS did not recurse, RHSBits came from getOperandBits and
4459 // refers to a specific Src slot. If that slot's contents changed
4460 // (by either recursion), RHSBits is stale.
4461 // (C) Symmetrically for LHS if it did not recurse.
4462 SmallVector<SDValue, 3> SrcBeforeRecurse(Src.begin(), Src.end());
4463 uint8_t LHSBitsOrig = LHSBits;
4464 uint8_t RHSBitsOrig = RHSBits;
4465
4466 auto LHSOp = BitOp3_Op(LHS, Src);
4467 if (LHSOp.first) {
4468 NumOpcodes += LHSOp.first;
4469 LHSBits = LHSOp.second;
4470 }
4471
4472 SmallVector<SDValue, 3> SrcAfterLHS(Src.begin(), Src.end());
4473
4474 auto RHSOp = BitOp3_Op(RHS, Src);
4475 if (RHSOp.first) {
4476 NumOpcodes += RHSOp.first;
4477 RHSBits = RHSOp.second;
4478 }
4479
4480 // dependsOnSlot: true iff the truth table TT varies with slot Slot.
4481 auto dependsOnSlot = [](uint8_t TT, int Slot) -> bool {
4482 if (Slot < 0 || Slot > 2)
4483 return false;
4484 const uint8_t Masks[3] = {0x0f, 0x33, 0x55};
4485 const int Shifts[3] = {4, 2, 1};
4486 return ((TT ^ (TT >> Shifts[Slot])) & Masks[Slot]) != 0;
4487 };
4488
4489 // findSlot: locate the Src slot a getOperandBits result depends on,
4490 // including negated (XOR with -1) patterns that getOperandBits
4491 // resolves via the NOT shortcut (~SrcBits[I]).
4492 const uint8_t SrcBitsConst[3] = {0xf0, 0xcc, 0xaa};
4493 auto findSlot = [&](uint8_t Bits, SDValue Op,
4494 const SmallVectorImpl<SDValue> &S) -> int {
4495 SDValue NegatedInner;
4496 bool IsNegationOp =
4497 Op.getOpcode() == ISD::XOR && isAllOnesConstant(Op.getOperand(1));
4498 if (IsNegationOp)
4499 NegatedInner = Op.getOperand(0);
4500 for (int I = 0; I < (int)S.size(); I++) {
4501 if (Bits == SrcBitsConst[I] && S[I] == Op)
4502 return I;
4503 if (IsNegationOp && Bits == (uint8_t)~SrcBitsConst[I] &&
4504 S[I] == NegatedInner)
4505 return I;
4506 }
4507 return -1;
4508 };
4509
4510 bool Stale = false;
4511
4512 // (A) LHS recursed: its truth table is against SrcAfterLHS.
4513 // Check if RHS recursion mutated a slot that LHSBits uses.
4514 if (LHSOp.first) {
4515 for (int I = 0; I < (int)SrcAfterLHS.size() && I < 3; I++) {
4516 if (I < (int)Src.size() && Src[I] != SrcAfterLHS[I] &&
4517 dependsOnSlot(LHSBits, I)) {
4518 Stale = true;
4519 break;
4520 }
4521 }
4522 }
4523
4524 // (B) RHS did not recurse: RHSBits from getOperandBits is against
4525 // SrcBeforeRecurse. Check if that slot was mutated since then.
4526 if (!Stale && !RHSOp.first) {
4527 int Slot = findSlot(RHSBitsOrig, RHS, SrcBeforeRecurse);
4528 if (Slot >= 0 &&
4529 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4530 Stale = true;
4531 }
4532
4533 // (C) LHS did not recurse: LHSBits from getOperandBits is against
4534 // SrcBeforeRecurse. Check if that slot was mutated since then.
4535 if (!Stale && !LHSOp.first) {
4536 int Slot = findSlot(LHSBitsOrig, LHS, SrcBeforeRecurse);
4537 if (Slot >= 0 &&
4538 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4539 Stale = true;
4540 }
4541
4542 if (Stale) {
4543 Src = std::move(SrcBeforeRecurse);
4544 LHSBits = LHSBitsOrig;
4545 RHSBits = RHSBitsOrig;
4546 NumOpcodes = 0;
4547 }
4548 break;
4549 }
4550 default:
4551 return std::make_pair(0, 0);
4552 }
4553
4554 uint8_t TTbl;
4555 switch (In.getOpcode()) {
4556 case ISD::AND:
4557 TTbl = LHSBits & RHSBits;
4558 break;
4559 case ISD::OR:
4560 TTbl = LHSBits | RHSBits;
4561 break;
4562 case ISD::XOR:
4563 TTbl = LHSBits ^ RHSBits;
4564 break;
4565 default:
4566 break;
4567 }
4568
4569 return std::make_pair(NumOpcodes + 1, TTbl);
4570}
4571
4572bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4573 SDValue &Src2, SDValue &Tbl) const {
4575 uint8_t TTbl;
4576 unsigned NumOpcodes;
4577
4578 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4579
4580 // Src.empty() case can happen if all operands are all zero or all ones.
4581 // Normally it shall be optimized out before reaching this.
4582 if (NumOpcodes < 2 || Src.empty())
4583 return false;
4584
4585 // For a uniform case threshold should be higher to account for moves between
4586 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4587 // and a readtfirstlane after.
4588 if (NumOpcodes < 4 && !In->isDivergent())
4589 return false;
4590
4591 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4592 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4593 // asm more readable. This cannot be modeled with AddedComplexity because
4594 // selector does not know how many operations did we match.
4595 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4596 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4597 In.getOperand(1).getOpcode() == In.getOpcode()))
4598 return false;
4599
4600 if (In.getOpcode() == ISD::OR &&
4601 (In.getOperand(0).getOpcode() == ISD::AND ||
4602 In.getOperand(1).getOpcode() == ISD::AND))
4603 return false;
4604 }
4605
4606 // Last operand can be ignored, turning a ternary operation into a binary.
4607 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4608 // 'c' with 'a' here without changing the answer. In some pathological
4609 // cases it should be possible to get an operation with a single operand
4610 // too if optimizer would not catch it.
4611 while (Src.size() < 3)
4612 Src.push_back(Src[0]);
4613
4614 Src0 = Src[0];
4615 Src1 = Src[1];
4616 Src2 = Src[2];
4617
4618 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4619 return true;
4620}
4621
4622SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4623 if (In.isUndef())
4624 return CurDAG->getUNDEF(MVT::i32);
4625
4626 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4627 SDLoc SL(In);
4628 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4629 }
4630
4631 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4632 SDLoc SL(In);
4633 return CurDAG->getConstant(
4634 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4635 }
4636
4637 SDValue Src;
4638 if (isExtractHiElt(In, Src))
4639 return Src;
4640
4641 return SDValue();
4642}
4643
4644bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4645 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4646
4647 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4648 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4649
4650 unsigned Limit = 0;
4651 bool AllUsesAcceptSReg = true;
4652 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4653 Limit < 10 && U != E; ++U, ++Limit) {
4654 const TargetRegisterClass *RC =
4655 getOperandRegClass(U->getUser(), U->getOperandNo());
4656
4657 // If the register class is unknown, it could be an unknown
4658 // register class that needs to be an SGPR, e.g. an inline asm
4659 // constraint
4660 if (!RC || SIRI->isSGPRClass(RC))
4661 return false;
4662
4663 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4664 RC != &AMDGPU::VS_64_Align2RegClass) {
4665 AllUsesAcceptSReg = false;
4666 SDNode *User = U->getUser();
4667 if (User->isMachineOpcode()) {
4668 unsigned Opc = User->getMachineOpcode();
4669 const MCInstrDesc &Desc = SII->get(Opc);
4670 if (Desc.isCommutable()) {
4671 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4672 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4673 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4674 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4675 const TargetRegisterClass *CommutedRC =
4676 getOperandRegClass(U->getUser(), CommutedOpNo);
4677 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4678 CommutedRC == &AMDGPU::VS_64RegClass ||
4679 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4680 AllUsesAcceptSReg = true;
4681 }
4682 }
4683 }
4684 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4685 // commuting current user. This means have at least one use
4686 // that strictly require VGPR. Thus, we will not attempt to commute
4687 // other user instructions.
4688 if (!AllUsesAcceptSReg)
4689 break;
4690 }
4691 }
4692 return !AllUsesAcceptSReg && (Limit < 10);
4693}
4694
4695bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4696 const auto *Ld = cast<LoadSDNode>(N);
4697 const MachineMemOperand *MMO = Ld->getMemOperand();
4698
4699 // FIXME: We ought to able able to take the direct isDivergent result. We
4700 // cannot rely on the MMO for a uniformity check, and should stop using
4701 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4702 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4703 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4704 // version, and then this can be dropped.
4705 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4706 return false;
4707
4708 return MMO->getSize().hasValue() &&
4709 Ld->getAlign() >=
4710 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4711 uint64_t(4))) &&
4712 (MMO->isInvariant() ||
4713 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4714 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4715 (Subtarget->getScalarizeGlobalBehavior() &&
4716 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4717 Ld->isSimple() &&
4718 static_cast<const SITargetLowering *>(getTargetLowering())
4719 ->isMemOpHasNoClobberedMemOperand(N)));
4720}
4721
4724 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4725 bool IsModified = false;
4726 do {
4727 IsModified = false;
4728
4729 // Go over all selected nodes and try to fold them a bit more
4730 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4731 while (Position != CurDAG->allnodes_end()) {
4732 SDNode *Node = &*Position++;
4734 if (!MachineNode)
4735 continue;
4736
4737 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4738 if (ResNode != Node) {
4739 if (ResNode)
4740 ReplaceUses(Node, ResNode);
4741 IsModified = true;
4742 }
4743 }
4744 CurDAG->RemoveDeadNodes();
4745 } while (IsModified);
4746}
4747
4752
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:119
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
bool isSDWAOperand(const SDNode *N) const
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:296
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
LLVM_ABI PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:860
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.