LLVM 19.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
33
34#ifdef EXPENSIVE_CHECKS
36#include "llvm/IR/Dominators.h"
37#endif
38
39#define DEBUG_TYPE "amdgpu-isel"
40
41using namespace llvm;
42
43//===----------------------------------------------------------------------===//
44// Instruction Selector Implementation
45//===----------------------------------------------------------------------===//
46
47namespace {
48static SDValue stripBitcast(SDValue Val) {
49 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
50}
51
52// Figure out if this is really an extract of the high 16-bits of a dword.
53static bool isExtractHiElt(SDValue In, SDValue &Out) {
54 In = stripBitcast(In);
55
56 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
58 if (!Idx->isOne())
59 return false;
60 Out = In.getOperand(0);
61 return true;
62 }
63 }
64
65 if (In.getOpcode() != ISD::TRUNCATE)
66 return false;
67
68 SDValue Srl = In.getOperand(0);
69 if (Srl.getOpcode() == ISD::SRL) {
70 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
71 if (ShiftAmt->getZExtValue() == 16) {
72 Out = stripBitcast(Srl.getOperand(0));
73 return true;
74 }
75 }
76 }
77
78 return false;
79}
80
81// Look through operations that obscure just looking at the low 16-bits of the
82// same register.
83static SDValue stripExtractLoElt(SDValue In) {
84 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85 SDValue Idx = In.getOperand(1);
86 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
87 return In.getOperand(0);
88 }
89
90 if (In.getOpcode() == ISD::TRUNCATE) {
91 SDValue Src = In.getOperand(0);
92 if (Src.getValueType().getSizeInBits() == 32)
93 return stripBitcast(Src);
94 }
95
96 return In;
97}
98
99} // end anonymous namespace
100
// NOTE(review): only the trailing arguments of what are presumably the
// pass-registration macro invocations are visible here: the description string
// and two `false` flags. The opening macro lines are not part of this listing.
102 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
106#ifdef EXPENSIVE_CHECKS
109#endif
111 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
112
113/// This pass converts a legalized DAG into an AMDGPU-specific
114/// DAG, ready for instruction scheduling.
116 CodeGenOptLevel OptLevel) {
 // Heap-allocates the selector; the raw pointer is handed to the caller.
117 return new AMDGPUDAGToDAGISel(TM, OptLevel);
118}
119
121 CodeGenOptLevel OptLevel)
122 : SelectionDAGISel(ID, TM, OptLevel) {
 // Copy the target machine's late-structurize-CFG setting into this selector.
123 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
124}
125
// Expensive-checks builds only: assert that every loop, visited in preorder,
// is in LCSSA form with respect to the dominator tree.
127#ifdef EXPENSIVE_CHECKS
128 DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
129 LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
130 for (auto &L : LI->getLoopsInPreorder()) {
131 assert(L->isLCSSAForm(DT));
132 }
133#endif
 // Cache the GCN subtarget and the mode-register defaults derived from the
 // current function.
134 Subtarget = &MF.getSubtarget<GCNSubtarget>();
135 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
137}
138
/// Classify opcode \p Opc by whether it is treated as zeroing the high bits.
///
/// For the large group of floating-point opcodes listed first, the answer is
/// true only when the subtarget generation is GFX9 or older. Unlisted opcodes
/// return false.
/// NOTE(review): some case labels and return statements of this switch are not
/// visible in this listing (e.g. after the FP_ROUND and FMA/FMAD labels);
/// confirm their results against the full source.
139bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
140 // XXX - only need to list legal operations.
141 switch (Opc) {
142 case ISD::FADD:
143 case ISD::FSUB:
144 case ISD::FMUL:
145 case ISD::FDIV:
146 case ISD::FREM:
148 case ISD::UINT_TO_FP:
149 case ISD::SINT_TO_FP:
150 case ISD::FABS:
151 // Fabs is lowered to a bit operation, but it's an and which will clear the
152 // high bits anyway.
153 case ISD::FSQRT:
154 case ISD::FSIN:
155 case ISD::FCOS:
156 case ISD::FPOWI:
157 case ISD::FPOW:
158 case ISD::FLOG:
159 case ISD::FLOG2:
160 case ISD::FLOG10:
161 case ISD::FEXP:
162 case ISD::FEXP2:
163 case ISD::FCEIL:
164 case ISD::FTRUNC:
165 case ISD::FRINT:
166 case ISD::FNEARBYINT:
167 case ISD::FROUNDEVEN:
168 case ISD::FROUND:
169 case ISD::FFLOOR:
170 case ISD::FMINNUM:
171 case ISD::FMAXNUM:
172 case ISD::FLDEXP:
173 case AMDGPUISD::FRACT:
174 case AMDGPUISD::CLAMP:
177 case AMDGPUISD::FMIN3:
178 case AMDGPUISD::FMAX3:
179 case AMDGPUISD::FMED3:
181 case AMDGPUISD::RCP:
182 case AMDGPUISD::RSQ:
184 // On gfx10, all 16-bit instructions preserve the high bits.
185 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
186 case ISD::FP_ROUND:
187 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
188 // high bits on gfx9.
189 // TODO: If we had the source node we could see if the source was fma/mad
191 case ISD::FMA:
192 case ISD::FMAD:
195 default:
196 // fcopysign, select and others may be lowered to 32-bit bit operations
197 // which don't zero the high bits.
198 return false;
199 }
200}
201
// NOTE(review): tail of a function whose signature and statements are not
// visible in this listing; only an EXPENSIVE_CHECKS-guarded region remains.
205#ifdef EXPENSIVE_CHECKS
208#endif
210}
211
// Tries to fold a v2i16/v2f16 build_vector, one of whose halves comes from a
// single-use load, into a D16 load node that takes the other half as a third
// ("tied") operand. The high half is tried first, then the low half.
// Returns true when the DAG was rewritten, false otherwise.
// Requires a subtarget where d16 preserves the unused bits (asserted below).
213 assert(Subtarget->d16PreservesUnusedBits());
214 MVT VT = N->getValueType(0).getSimpleVT();
215 if (VT != MVT::v2i16 && VT != MVT::v2f16)
216 return false;
217
218 SDValue Lo = N->getOperand(0);
219 SDValue Hi = N->getOperand(1);
220
221 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
222
223 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
224 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
225 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
226
227 // Need to check for possible indirect dependencies on the other half of the
228 // vector to avoid introducing a cycle.
229 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
230 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
231
 // NOTE(review): the definition of TiedIn for this branch is not visible in
 // this listing.
233 SDValue Ops[] = {
234 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
235 };
236
 // i8 memory types pick a sign- or zero-extending variant; otherwise the
 // memory type must be i16.
237 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
238 if (LdHi->getMemoryVT() == MVT::i8) {
239 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
241 } else {
242 assert(LdHi->getMemoryVT() == MVT::i16);
243 }
244
245 SDValue NewLoadHi =
246 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
247 Ops, LdHi->getMemoryVT(),
248 LdHi->getMemOperand());
249
 // The new node replaces both the build_vector value and the old load's
 // chain result.
250 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
251 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
252 return true;
253 }
254
255 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
256 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
257 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
258 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
259 if (LdLo && Lo.hasOneUse()) {
260 SDValue TiedIn = getHi16Elt(Hi);
 // Bail out if there is no usable high element or if using it would make the
 // load a predecessor of its own operand.
261 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
262 return false;
263
264 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
265 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
266 if (LdLo->getMemoryVT() == MVT::i8) {
267 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
269 } else {
270 assert(LdLo->getMemoryVT() == MVT::i16);
271 }
272
273 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
274
275 SDValue Ops[] = {
276 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
277 };
278
279 SDValue NewLoadLo =
280 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
281 Ops, LdLo->getMemoryVT(),
282 LdLo->getMemOperand());
283
284 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
285 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
286 return true;
287 }
288
289 return false;
290}
291
// Pre-selection DAG rewrite: does nothing unless d16 preserves unused bits on
// this subtarget. Otherwise walks the node list backwards from Position,
// skipping nodes without uses, and tries matchLoadD16FromBuildVector on the
// opcodes selected by the switch.
// NOTE(review): the initialisation of Position and the case label(s) of the
// switch are not visible in this listing.
293 if (!Subtarget->d16PreservesUnusedBits())
294 return;
295
297
298 bool MadeChange = false;
299 while (Position != CurDAG->allnodes_begin()) {
300 SDNode *N = &*--Position;
301 if (N->use_empty())
302 continue;
303
304 switch (N->getOpcode()) {
306 // TODO: Match load d16 from shl (extload:i16), 16
307 MadeChange |= matchLoadD16FromBuildVector(N);
308 break;
309 default:
310 break;
311 }
312 }
313
 // Dump the DAG in debug builds, but only when something was rewritten.
314 if (MadeChange) {
316 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
317 CurDAG->dump(););
318 }
319}
320
321bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
322 if (N->isUndef())
323 return true;
324
325 const SIInstrInfo *TII = Subtarget->getInstrInfo();
326 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
327 return TII->isInlineConstant(C->getAPIntValue());
328
329 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
330 return TII->isInlineConstant(C->getValueAPF());
331
332 return false;
333}
334
335/// Determine the register class for \p OpNo
336/// \returns The register class of the virtual register that will be used for
337/// the given operand number \p OpNo or NULL if the register class cannot be
338/// determined.
339const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
340 unsigned OpNo) const {
 // Not-yet-selected nodes: only CopyToReg is understood; everything else
 // yields nullptr.
341 if (!N->isMachineOpcode()) {
342 if (N->getOpcode() == ISD::CopyToReg) {
343 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
344 if (Reg.isVirtual()) {
 // NOTE(review): the definition of MRI is not visible in this listing.
346 return MRI.getRegClass(Reg);
347 }
348
 // Physical register: use its base register class.
349 const SIRegisterInfo *TRI
350 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
351 return TRI->getPhysRegBaseClass(Reg);
352 }
353
354 return nullptr;
355 }
356
357 switch (N->getMachineOpcode()) {
358 default: {
 // Generic machine node: look the operand up in the instruction
 // description, skipping over the defs. Out-of-range operands and operands
 // with RegClass == -1 have no class.
359 const MCInstrDesc &Desc =
360 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
361 unsigned OpIdx = Desc.getNumDefs() + OpNo;
362 if (OpIdx >= Desc.getNumOperands())
363 return nullptr;
364 int RegClass = Desc.operands()[OpIdx].RegClass;
365 if (RegClass == -1)
366 return nullptr;
367
368 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
369 }
370 case AMDGPU::REG_SEQUENCE: {
 // Operand 0 holds the super-register class ID; the operand at OpNo + 1 is
 // read as the sub-register index paired with operand OpNo.
371 unsigned RCID = N->getConstantOperandVal(0);
372 const TargetRegisterClass *SuperRC =
373 Subtarget->getRegisterInfo()->getRegClass(RCID);
374
375 SDValue SubRegOp = N->getOperand(OpNo + 1);
376 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
377 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
378 SubRegIdx);
379 }
380 }
381}
382
383SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
384 SDValue Glue) const {
385 SmallVector <SDValue, 8> Ops;
386 Ops.push_back(NewChain); // Replace the chain.
387 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
388 Ops.push_back(N->getOperand(i));
389
390 Ops.push_back(Glue);
391 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
392}
393
/// Emit a copy of \p Val to M0 (via the target lowering's copyToM0) on top of
/// \p N's current chain, then rewrite \p N to use the copy as its chain and
/// the copy's second result as glue.
394SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
 // NOTE(review): the declaration that binds `Lowering` to this cast is not
 // visible in this listing.
396 *static_cast<const SITargetLowering*>(getTargetLowering());
397
398 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
399
400 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
401 return glueCopyToOp(N, M0, M0.getValue(1));
402}
403
/// Glue an M0 initialisation onto memory node \p N where its address space
/// calls for one:
///   * LOCAL_ADDRESS: M0 = -1, only if the subtarget requires M0 init for LDS;
///   * REGION_ADDRESS: M0 = the function's GDS size.
/// \returns the rewritten node, or \p N unchanged when nothing was needed.
404SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
405 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
406 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
407 if (Subtarget->ldsRequiresM0Init())
408 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
409 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
 // NOTE(review): the definition of MF is not visible in this listing.
411 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
412 return
413 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
414 }
415 return N;
416}
417
/// Materialise the 64-bit immediate \p Imm as two S_MOV_B32 nodes (low and
/// high 32 bits) joined by a REG_SEQUENCE of class SReg_64 (sub0 = low half,
/// sub1 = high half). The resulting node has value type \p VT.
418MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
419 EVT VT) const {
 // NOTE(review): the line that starts the definition of `Lo` is not visible
 // in this listing; these are the arguments of its S_MOV_B32 node.
421 AMDGPU::S_MOV_B32, DL, MVT::i32,
422 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
423 SDNode *Hi =
424 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
425 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
 // REG_SEQUENCE operands: register class, then (value, subreg index) pairs.
426 const SDValue Ops[] = {
427 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
428 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
429 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
430
431 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
432}
433
/// Select vector-building node \p N into a REG_SEQUENCE of register class
/// \p RegClassID, or into COPY_TO_REGCLASS for single-element vectors.
/// When \p N has fewer operands than the vector has elements (asserted to be
/// the SCALAR_TO_VECTOR case), the remaining lanes are filled with one shared
/// IMPLICIT_DEF.
434void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
435 EVT VT = N->getValueType(0);
436 unsigned NumVectorElts = VT.getVectorNumElements();
437 EVT EltVT = VT.getVectorElementType();
438 SDLoc DL(N);
439 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
440
441 if (NumVectorElts == 1) {
442 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
443 RegClass);
444 return;
445 }
446
447 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
448 "supported yet");
449 // 32 = Max Num Vector Elements
450 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
451 // 1 = Vector Register Class
452 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
453
 // NOTE(review): the right-hand side of this comparison, and the alternative
 // branch of the two `Sub` conditionals below, are not visible in this listing.
454 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
456 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
457 bool IsRegSeq = true;
458 unsigned NOps = N->getNumOperands();
459 for (unsigned i = 0; i < NOps; i++) {
460 // XXX: Why is this here?
461 if (isa<RegisterSDNode>(N->getOperand(i))) {
462 IsRegSeq = false;
463 break;
464 }
465 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
467 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
468 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
469 }
470 if (NOps != NumVectorElts) {
471 // Fill in the missing undef elements if this was a scalar_to_vector.
472 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
473 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
474 DL, EltVT);
475 for (unsigned i = NOps; i < NumVectorElts; ++i) {
476 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
478 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
479 RegSeqArgs[1 + (2 * i) + 1] =
480 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
481 }
482 }
483
 // NOTE(review): there is no `return` after SelectCode(N), so the
 // SelectNodeTo below still runs when IsRegSeq is false, with RegSeqArgs only
 // partially filled. Confirm whether that is intended.
484 if (!IsRegSeq)
485 SelectCode(N);
486 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
487}
488
// Body of the per-node selection routine as visible in this listing: nodes
// that are already machine nodes are marked and skipped, memory/atomic nodes
// get an M0 initialisation glued on where needed, and the switch hands
// specific opcodes to dedicated Select* helpers. Everything that breaks out of
// the switch falls through to SelectCode(N) at the end.
// NOTE(review): the function signature and a number of case labels are not
// visible in this listing.
490 unsigned int Opc = N->getOpcode();
491 if (N->isMachineOpcode()) {
492 N->setNodeId(-1);
493 return; // Already selected.
494 }
495
496 // isa<MemSDNode> almost works but is slightly too permissive for some DS
497 // intrinsics.
498 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
501 N = glueCopyToM0LDSInit(N);
502 SelectCode(N);
503 return;
504 }
505
506 switch (Opc) {
507 default:
508 break;
509 // We are selecting i64 ADD here instead of custom lower it during
510 // DAG legalization, so we can fold some i64 ADDs used for address
511 // calculation into the LOAD and STORE instructions.
512 case ISD::ADDC:
513 case ISD::ADDE:
514 case ISD::SUBC:
515 case ISD::SUBE: {
516 if (N->getValueType(0) != MVT::i64)
517 break;
518
519 SelectADD_SUB_I64(N);
520 return;
521 }
522 case ISD::UADDO_CARRY:
523 case ISD::USUBO_CARRY:
524 if (N->getValueType(0) != MVT::i32)
525 break;
526
527 SelectAddcSubb(N);
528 return;
529 case ISD::UADDO:
530 case ISD::USUBO: {
531 SelectUADDO_USUBO(N);
532 return;
533 }
535 SelectFMUL_W_CHAIN(N);
536 return;
537 }
539 SelectFMA_W_CHAIN(N);
540 return;
541 }
542
544 case ISD::BUILD_VECTOR: {
545 EVT VT = N->getValueType(0);
546 unsigned NumVectorElts = VT.getVectorNumElements();
 // 16-bit element vectors: only a two-element BUILD_VECTOR of constants is
 // packed here; anything else goes to SelectCode.
547 if (VT.getScalarSizeInBits() == 16) {
548 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
549 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
550 ReplaceNode(N, Packed);
551 return;
552 }
553 }
554
555 break;
556 }
557
558 assert(VT.getVectorElementType().bitsEq(MVT::i32));
559 unsigned RegClassID =
560 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
561 SelectBuildVector(N, RegClassID);
562 return;
563 }
564 case ISD::BUILD_PAIR: {
 // i128 and i64 pairs become a REG_SEQUENCE of the two halves; any other
 // type is unreachable.
565 SDValue RC, SubReg0, SubReg1;
566 SDLoc DL(N);
567 if (N->getValueType(0) == MVT::i128) {
568 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
569 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
570 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
571 } else if (N->getValueType(0) == MVT::i64) {
572 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
573 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
574 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
575 } else {
576 llvm_unreachable("Unhandled value type for BUILD_PAIR");
577 }
578 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
579 N->getOperand(1), SubReg1 };
580 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
581 N->getValueType(0), Ops));
582 return;
583 }
584
585 case ISD::Constant:
586 case ISD::ConstantFP: {
 // Only 64-bit constants that are neither inline immediates nor valid 32-bit
 // literals are expanded to a pair of 32-bit moves here.
587 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
588 break;
589
590 uint64_t Imm;
591 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
592 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
593 if (AMDGPU::isValid32BitLiteral(Imm, true))
594 break;
595 } else {
596 ConstantSDNode *C = cast<ConstantSDNode>(N);
597 Imm = C->getZExtValue();
598 if (AMDGPU::isValid32BitLiteral(Imm, false))
599 break;
600 }
601
602 SDLoc DL(N);
603 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
604 return;
605 }
607 case AMDGPUISD::BFE_U32: {
608 // There is a scalar version available, but unlike the vector version which
609 // has a separate operand for the offset and width, the scalar version packs
610 // the width and offset into a single operand. Try to move to the scalar
611 // version if the offsets are constant, so that we can try to keep extended
612 // loads of kernel arguments in SGPRs.
613
614 // TODO: Technically we could try to pattern match scalar bitshifts of
615 // dynamic values, but it's probably not useful.
616 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
617 if (!Offset)
618 break;
619
620 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
621 if (!Width)
622 break;
623
624 bool Signed = Opc == AMDGPUISD::BFE_I32;
625
626 uint32_t OffsetVal = Offset->getZExtValue();
627 uint32_t WidthVal = Width->getZExtValue();
628
629 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
630 WidthVal));
631 return;
632 }
634 SelectDIV_SCALE(N);
635 return;
636 }
639 SelectMAD_64_32(N);
640 return;
641 }
642 case ISD::SMUL_LOHI:
643 case ISD::UMUL_LOHI:
644 return SelectMUL_LOHI(N);
645 case ISD::CopyToReg: {
647 *static_cast<const SITargetLowering*>(getTargetLowering());
648 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
649 break;
650 }
651 case ISD::AND:
652 case ISD::SRL:
653 case ISD::SRA:
655 if (N->getValueType(0) != MVT::i32)
656 break;
657
658 SelectS_BFE(N);
659 return;
660 case ISD::BRCOND:
661 SelectBRCOND(N);
662 return;
663 case ISD::FP_EXTEND:
664 SelectFP_EXTEND(N);
665 return;
671 // Hack around using a legal type if f16 is illegal.
672 if (N->getValueType(0) == MVT::i32) {
673 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
674 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
675 { N->getOperand(0), N->getOperand(1) });
676 SelectCode(N);
677 return;
678 }
679
680 break;
681 }
683 SelectINTRINSIC_W_CHAIN(N);
684 return;
685 }
687 SelectINTRINSIC_WO_CHAIN(N);
688 return;
689 }
690 case ISD::INTRINSIC_VOID: {
691 SelectINTRINSIC_VOID(N);
692 return;
693 }
695 SelectWAVE_ADDRESS(N);
696 return;
697 }
698 case ISD::STACKRESTORE: {
699 SelectSTACKRESTORE(N);
700 return;
701 }
702 }
703
704 SelectCode(N);
705}
706
707bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
708 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
709 const Instruction *Term = BB->getTerminator();
710 return Term->getMetadata("amdgpu.uniform") ||
711 Term->getMetadata("structurizecfg.uniform");
712}
713
714bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
715 unsigned ShAmtBits) const {
716 assert(N->getOpcode() == ISD::AND);
717
718 const APInt &RHS = N->getConstantOperandAPInt(1);
719 if (RHS.countr_one() >= ShAmtBits)
720 return true;
721
722 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
723 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
724}
725
// Matches a 64-bit address built as a bitcast of a two-element build_vector
// whose low element is `or (extract_vector_elt V, 0), OFFSET` and whose high
// element is `extract_vector_elt V, 1`. On success sets N0 to operand 0 of V
// and N1 to the OR's constant operand, and returns true.
// NOTE(review): the first signature line and one condition of the `if` below
// are not visible in this listing.
727 SDValue &N0, SDValue &N1) {
728 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
729 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
730 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
731 // (i64 (bitcast (v2i32 (build_vector
732 // (or (extract_vector_elt V, 0), OFFSET),
733 // (extract_vector_elt V, 1)))))
734 SDValue Lo = Addr.getOperand(0).getOperand(0);
735 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
736 SDValue BaseLo = Lo.getOperand(0);
737 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
738 // Check that split base (Lo and Hi) are extracted from the same one.
739 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
741 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
742 // Lo is statically extracted from index 0.
743 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
744 BaseLo.getConstantOperandVal(1) == 0 &&
745 // Hi is statically extracted from index 1.
746 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
747 BaseHi.getConstantOperandVal(1) == 1) {
748 N0 = BaseLo.getOperand(0).getOperand(0);
749 N1 = Lo.getOperand(1);
750 return true;
751 }
752 }
753 }
754 return false;
755}
756
/// Split \p Addr into a base (\p LHS) and a constant offset (\p RHS).
/// Falls back to the split-OR pattern matcher when the first check fails.
/// \returns true if either form matched.
/// NOTE(review): the condition guarding the first branch is not visible in
/// this listing.
757bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
758 SDValue &RHS) const {
760 LHS = Addr.getOperand(0);
761 RHS = Addr.getOperand(1);
762 return true;
763 }
764
765 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
 // The split-OR matcher must hand back non-null values, with a constant as
 // the offset.
766 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
767 return true;
768 }
769
770 return false;
771}
772
 // Returns the fixed human-readable description string for this selector.
774 return "AMDGPU DAG->DAG Pattern Instruction Selection";
775}
776
777//===----------------------------------------------------------------------===//
778// Complex Patterns
779//===----------------------------------------------------------------------===//
780
781bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
782 SDValue &Offset) {
783 return false;
784}
785
/// Split \p Addr into a \p Base register and a constant i32 \p Offset.
/// Always succeeds. Cases, in order:
///   * constant address, or DWORDADDR of a constant: base is the
///     R600::INDIRECT_BASE_ADDR register, offset is the constant;
///   * ADD/OR with a constant second operand: base is the first operand;
///   * anything else: base is \p Addr itself with offset 0.
/// NOTE(review): the declaration of `C` is not visible in this listing.
786bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
787 SDValue &Offset) {
789 SDLoc DL(Addr);
790
791 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
792 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
793 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
794 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
795 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
796 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
797 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
798 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
799 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
800 Base = Addr.getOperand(0);
801 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
802 } else {
803 Base = Addr;
804 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
805 }
806
807 return true;
808}
809
/// Wrap \p Val in an S_MOV_B32 machine node producing an i32 and return that
/// node's first result.
/// NOTE(review): the line that starts the definition of `Mov` is not visible
/// in this listing.
810SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
811 const SDLoc &DL) const {
813 AMDGPU::S_MOV_B32, DL, MVT::i32,
814 CurDAG->getTargetConstant(Val, DL, MVT::i32));
815 return SDValue(Mov, 0);
816}
817
818// FIXME: Should only handle uaddo_carry/usubo_carry
819void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
820 SDLoc DL(N);
821 SDValue LHS = N->getOperand(0);
822 SDValue RHS = N->getOperand(1);
823
824 unsigned Opcode = N->getOpcode();
825 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
826 bool ProduceCarry =
827 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
828 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
829
830 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
831 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
832
833 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
834 DL, MVT::i32, LHS, Sub0);
835 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
836 DL, MVT::i32, LHS, Sub1);
837
838 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
839 DL, MVT::i32, RHS, Sub0);
840 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
841 DL, MVT::i32, RHS, Sub1);
842
843 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
844
845 static const unsigned OpcMap[2][2][2] = {
846 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
847 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
848 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
849 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
850
851 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
852 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
853
854 SDNode *AddLo;
855 if (!ConsumeCarry) {
856 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
857 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
858 } else {
859 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
860 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
861 }
862 SDValue AddHiArgs[] = {
863 SDValue(Hi0, 0),
864 SDValue(Hi1, 0),
865 SDValue(AddLo, 1)
866 };
867 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
868
869 SDValue RegSequenceArgs[] = {
870 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
871 SDValue(AddLo,0),
872 Sub0,
873 SDValue(AddHi,0),
874 Sub1,
875 };
876 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
877 MVT::i64, RegSequenceArgs);
878
879 if (ProduceCarry) {
880 // Replace the carry-use
881 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
882 }
883
884 // Replace the remaining uses.
885 ReplaceNode(N, RegSequence);
886}
887
/// Select UADDO_CARRY / USUBO_CARRY node \p N.
/// Divergent nodes use V_ADDC_U32_e64 / V_SUBB_U32_e64 with a zero clamp
/// operand appended. Non-divergent nodes use the S_ADD_CO_PSEUDO /
/// S_SUB_CO_PSEUDO pseudos.
/// NOTE(review): the call that consumes the divergent-branch arguments is not
/// visible in this listing.
888void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
889 SDLoc DL(N);
890 SDValue LHS = N->getOperand(0);
891 SDValue RHS = N->getOperand(1);
892 SDValue CI = N->getOperand(2);
893
894 if (N->isDivergent()) {
895 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
896 : AMDGPU::V_SUBB_U32_e64;
898 N, Opc, N->getVTList(),
899 {LHS, RHS, CI,
900 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
901 } else {
902 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
903 : AMDGPU::S_SUB_CO_PSEUDO;
904 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
905 }
906}
907
/// Select UADDO / USUBO node \p N to either the VALU carry-out instruction or
/// the scalar pseudo.
/// The VALU form is used when \p N is divergent, or when any user of the carry
/// result (result number 1) is something other than the matching
/// UADDO_CARRY / USUBO_CARRY.
/// NOTE(review): the call that consumes the VALU-branch arguments is not
/// visible in this listing.
908void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
909 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
910 // carry out despite the _i32 name. These were renamed in VI to _U32.
911 // FIXME: We should probably rename the opcodes here.
912 bool IsAdd = N->getOpcode() == ISD::UADDO;
913 bool IsVALU = N->isDivergent();
914
 // Scan the users of the carry result; one mismatching user is enough to
 // force the VALU form.
915 for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
916 ++UI)
917 if (UI.getUse().getResNo() == 1) {
918 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
919 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
920 IsVALU = true;
921 break;
922 }
923 }
924
925 if (IsVALU) {
926 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
927
929 N, Opc, N->getVTList(),
930 {N->getOperand(0), N->getOperand(1),
931 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
932 } else {
933 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
934 : AMDGPU::S_USUBO_PSEUDO;
935
936 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
937 {N->getOperand(0), N->getOperand(1)});
938 }
939}
940
941void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
942 SDLoc SL(N);
943 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
944 SDValue Ops[10];
945
946 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
947 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
948 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
949 Ops[8] = N->getOperand(0);
950 Ops[9] = N->getOperand(4);
951
952 // If there are no source modifiers, prefer fmac over fma because it can use
953 // the smaller VOP2 encoding.
954 bool UseFMAC = Subtarget->hasDLInsts() &&
955 cast<ConstantSDNode>(Ops[0])->isZero() &&
956 cast<ConstantSDNode>(Ops[2])->isZero() &&
957 cast<ConstantSDNode>(Ops[4])->isZero();
958 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
959 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
960}
961
962void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
963 SDLoc SL(N);
964 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
965 SDValue Ops[8];
966
967 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
968 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
969 Ops[6] = N->getOperand(0);
970 Ops[7] = N->getOperand(3);
971
972 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
973}
974
975// We need to handle this here because tablegen doesn't support matching
976// instructions with multiple outputs.
977void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
978 SDLoc SL(N);
979 EVT VT = N->getValueType(0);
980
981 assert(VT == MVT::f32 || VT == MVT::f64);
982
983 unsigned Opc
984 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
985
986 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
987 // omod
988 SDValue Ops[8];
989 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
990 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
991 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
992 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
993}
994
995// We need to handle this here because tablegen doesn't support matching
996// instructions with multiple outputs.
997void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
998 SDLoc SL(N);
999 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1000 unsigned Opc;
1001 if (Subtarget->hasMADIntraFwdBug())
1002 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1003 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1004 else
1005 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1006
1007 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1008 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1009 Clamp };
1010 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1011}
1012
1013// We need to handle this here because tablegen doesn't support matching
1014// instructions with multiple outputs.
// Implements SMUL_LOHI / UMUL_LOHI as a 64-bit V_MAD_*_e64 with a zero addend
// and zero clamp. The low (sub0) and high (sub1) halves of the result are
// extracted only for those results of N that actually have uses.
// NOTE(review): the statement before the closing brace is not visible in this
// listing.
1015void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1016 SDLoc SL(N);
1017 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1018 unsigned Opc;
 // Subtargets with the MAD intra-forwarding bug use the gfx11 opcode variants.
1019 if (Subtarget->hasMADIntraFwdBug())
1020 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1021 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1022 else
1023 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1024
1025 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1026 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1027 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1028 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1029 if (!SDValue(N, 0).use_empty()) {
1030 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1031 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1032 MVT::i32, SDValue(Mad, 0), Sub0);
1033 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1034 }
1035 if (!SDValue(N, 1).use_empty()) {
1036 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1037 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1038 MVT::i32, SDValue(Mad, 0), Sub1);
1039 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1040 }
1042}
1043
1044bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1045 if (!isUInt<16>(Offset))
1046 return false;
1047
1048 if (!Base || Subtarget->hasUsableDSOffset() ||
1049 Subtarget->unsafeDSOffsetFoldingEnabled())
1050 return true;
1051
1052 // On Southern Islands instruction with a negative base value and an offset
1053 // don't seem to work.
1054 return CurDAG->SignBitIsZero(Base);
1055}
1056
1057bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1058 SDValue &Offset) const {
1059 SDLoc DL(Addr);
1061 SDValue N0 = Addr.getOperand(0);
1062 SDValue N1 = Addr.getOperand(1);
1063 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1064 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1065 // (add n0, c0)
1066 Base = N0;
1067 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1068 return true;
1069 }
1070 } else if (Addr.getOpcode() == ISD::SUB) {
1071 // sub C, x -> add (sub 0, x), C
1072 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1073 int64_t ByteOffset = C->getSExtValue();
1074 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1075 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1076
1077 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1078 // the known bits in isDSOffsetLegal. We need to emit the selected node
1079 // here, so this is thrown away.
1080 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1081 Zero, Addr.getOperand(1));
1082
1083 if (isDSOffsetLegal(Sub, ByteOffset)) {
1085 Opnds.push_back(Zero);
1086 Opnds.push_back(Addr.getOperand(1));
1087
1088 // FIXME: Select to VOP3 version for with-carry.
1089 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1090 if (Subtarget->hasAddNoCarry()) {
1091 SubOp = AMDGPU::V_SUB_U32_e64;
1092 Opnds.push_back(
1093 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1094 }
1095
1096 MachineSDNode *MachineSub =
1097 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1098
1099 Base = SDValue(MachineSub, 0);
1100 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1101 return true;
1102 }
1103 }
1104 }
1105 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1106 // If we have a constant address, prefer to put the constant into the
1107 // offset. This can save moves to load the constant address since multiple
1108 // operations can share the zero base address register, and enables merging
1109 // into read2 / write2 instructions.
1110
1111 SDLoc DL(Addr);
1112
1113 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1114 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1115 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1116 DL, MVT::i32, Zero);
1117 Base = SDValue(MovZero, 0);
1118 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1119 return true;
1120 }
1121 }
1122
1123 // default case
1124 Base = Addr;
1125 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1126 return true;
1127}
1128
1129bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1130 unsigned Offset1,
1131 unsigned Size) const {
1132 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1133 return false;
1134 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1135 return false;
1136
1137 if (!Base || Subtarget->hasUsableDSOffset() ||
1138 Subtarget->unsafeDSOffsetFoldingEnabled())
1139 return true;
1140
1141 // On Southern Islands instruction with a negative base value and an offset
1142 // don't seem to work.
1143 return CurDAG->SignBitIsZero(Base);
1144}
1145
1146// Return whether the operation has NoUnsignedWrap property.
1148 return (Addr.getOpcode() == ISD::ADD &&
1149 Addr->getFlags().hasNoUnsignedWrap()) ||
1150 Addr->getOpcode() == ISD::OR;
1151}
1152
1153// Check that the base address of flat scratch load/store in the form of `base +
1154// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1155// requirement). We always treat the first operand as the base address here.
1156bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1158 return true;
1159
1160 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1161 // values.
1162 if (Subtarget->hasSignedScratchOffsets())
1163 return true;
1164
1165 auto LHS = Addr.getOperand(0);
1166 auto RHS = Addr.getOperand(1);
1167
1168 // If the immediate offset is negative and within certain range, the base
1169 // address cannot also be negative. If the base is also negative, the sum
1170 // would be either negative or much larger than the valid range of scratch
1171 // memory a thread can access.
1172 ConstantSDNode *ImmOp = nullptr;
1173 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1174 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1175 return true;
1176 }
1177
1178 return CurDAG->SignBitIsZero(LHS);
1179}
1180
1181// Check address value in SGPR/VGPR are legal for flat scratch in the form
1182// of: SGPR + VGPR.
1183bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1185 return true;
1186
1187 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1188 // values.
1189 if (Subtarget->hasSignedScratchOffsets())
1190 return true;
1191
1192 auto LHS = Addr.getOperand(0);
1193 auto RHS = Addr.getOperand(1);
1194 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1195}
1196
1197// Check address value in SGPR/VGPR are legal for flat scratch in the form
1198// of: SGPR + VGPR + Imm.
1199bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1200 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1201 // values.
1202 if (AMDGPU::isGFX12Plus(*Subtarget))
1203 return true;
1204
1205 auto Base = Addr.getOperand(0);
1206 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1207 // If the immediate offset is negative and within certain range, the base
1208 // address cannot also be negative. If the base is also negative, the sum
1209 // would be either negative or much larger than the valid range of scratch
1210 // memory a thread can access.
1211 if (isNoUnsignedWrap(Base) &&
1213 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1214 return true;
1215
1216 auto LHS = Base.getOperand(0);
1217 auto RHS = Base.getOperand(1);
1218 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1219}
1220
1221// TODO: If offset is too big, put low 16-bit into offset.
1222bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1223 SDValue &Offset0,
1224 SDValue &Offset1) const {
1225 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1226}
1227
1228bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1229 SDValue &Offset0,
1230 SDValue &Offset1) const {
1231 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1232}
1233
1234bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1235 SDValue &Offset0, SDValue &Offset1,
1236 unsigned Size) const {
1237 SDLoc DL(Addr);
1238
1240 SDValue N0 = Addr.getOperand(0);
1241 SDValue N1 = Addr.getOperand(1);
1242 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1243 unsigned OffsetValue0 = C1->getZExtValue();
1244 unsigned OffsetValue1 = OffsetValue0 + Size;
1245
1246 // (add n0, c0)
1247 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1248 Base = N0;
1249 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1250 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1251 return true;
1252 }
1253 } else if (Addr.getOpcode() == ISD::SUB) {
1254 // sub C, x -> add (sub 0, x), C
1255 if (const ConstantSDNode *C =
1256 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1257 unsigned OffsetValue0 = C->getZExtValue();
1258 unsigned OffsetValue1 = OffsetValue0 + Size;
1259
1260 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1261 SDLoc DL(Addr);
1262 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1263
1264 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1265 // the known bits in isDSOffsetLegal. We need to emit the selected node
1266 // here, so this is thrown away.
1267 SDValue Sub =
1268 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1269
1270 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1272 Opnds.push_back(Zero);
1273 Opnds.push_back(Addr.getOperand(1));
1274 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1275 if (Subtarget->hasAddNoCarry()) {
1276 SubOp = AMDGPU::V_SUB_U32_e64;
1277 Opnds.push_back(
1278 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1279 }
1280
1281 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1282 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1283
1284 Base = SDValue(MachineSub, 0);
1285 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1286 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1287 return true;
1288 }
1289 }
1290 }
1291 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1292 unsigned OffsetValue0 = CAddr->getZExtValue();
1293 unsigned OffsetValue1 = OffsetValue0 + Size;
1294
1295 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1296 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1297 MachineSDNode *MovZero =
1298 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1299 Base = SDValue(MovZero, 0);
1300 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1301 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1302 return true;
1303 }
1304 }
1305
1306 // default case
1307
1308 Base = Addr;
1309 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1310 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1311 return true;
1312}
1313
1314bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1315 SDValue &SOffset, SDValue &Offset,
1316 SDValue &Offen, SDValue &Idxen,
1317 SDValue &Addr64) const {
1318 // Subtarget prefers to use flat instruction
1319 // FIXME: This should be a pattern predicate and not reach here
1320 if (Subtarget->useFlatForGlobal())
1321 return false;
1322
1323 SDLoc DL(Addr);
1324
1325 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1326 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1327 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1328 SOffset = Subtarget->hasRestrictedSOffset()
1329 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1330 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1331
1332 ConstantSDNode *C1 = nullptr;
1333 SDValue N0 = Addr;
1335 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1336 if (isUInt<32>(C1->getZExtValue()))
1337 N0 = Addr.getOperand(0);
1338 else
1339 C1 = nullptr;
1340 }
1341
1342 if (N0.getOpcode() == ISD::ADD) {
1343 // (add N2, N3) -> addr64, or
1344 // (add (add N2, N3), C1) -> addr64
1345 SDValue N2 = N0.getOperand(0);
1346 SDValue N3 = N0.getOperand(1);
1347 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1348
1349 if (N2->isDivergent()) {
1350 if (N3->isDivergent()) {
1351 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1352 // addr64, and construct the resource from a 0 address.
1353 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1354 VAddr = N0;
1355 } else {
1356 // N2 is divergent, N3 is not.
1357 Ptr = N3;
1358 VAddr = N2;
1359 }
1360 } else {
1361 // N2 is not divergent.
1362 Ptr = N2;
1363 VAddr = N3;
1364 }
1365 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1366 } else if (N0->isDivergent()) {
1367 // N0 is divergent. Use it as the addr64, and construct the resource from a
1368 // 0 address.
1369 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1370 VAddr = N0;
1371 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1372 } else {
1373 // N0 -> offset, or
1374 // (N0 + C1) -> offset
1375 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1376 Ptr = N0;
1377 }
1378
1379 if (!C1) {
1380 // No offset.
1381 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1382 return true;
1383 }
1384
1385 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1386 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1387 // Legal offset for instruction.
1388 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1389 return true;
1390 }
1391
1392 // Illegal offset, store it in soffset.
1393 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1394 SOffset =
1396 AMDGPU::S_MOV_B32, DL, MVT::i32,
1397 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1398 0);
1399 return true;
1400}
1401
1402bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1403 SDValue &VAddr, SDValue &SOffset,
1404 SDValue &Offset) const {
1405 SDValue Ptr, Offen, Idxen, Addr64;
1406
1407 // addr64 bit was removed for volcanic islands.
1408 // FIXME: This should be a pattern predicate and not reach here
1409 if (!Subtarget->hasAddr64())
1410 return false;
1411
1412 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1413 return false;
1414
1415 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1416 if (C->getSExtValue()) {
1417 SDLoc DL(Addr);
1418
1419 const SITargetLowering& Lowering =
1420 *static_cast<const SITargetLowering*>(getTargetLowering());
1421
1422 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1423 return true;
1424 }
1425
1426 return false;
1427}
1428
1429std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1430 SDLoc DL(N);
1431
1432 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1433 SDValue TFI =
1434 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1435
1436 // We rebase the base address into an absolute stack address and hence
1437 // use constant 0 for soffset. This value must be retained until
1438 // frame elimination and eliminateFrameIndex will choose the appropriate
1439 // frame register if need be.
1440 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1441}
1442
1443bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1444 SDValue Addr, SDValue &Rsrc,
1445 SDValue &VAddr, SDValue &SOffset,
1446 SDValue &ImmOffset) const {
1447
1448 SDLoc DL(Addr);
1451
1452 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1453
1454 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1455 int64_t Imm = CAddr->getSExtValue();
1456 const int64_t NullPtr =
1458 // Don't fold null pointer.
1459 if (Imm != NullPtr) {
1460 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1461 SDValue HighBits =
1462 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1463 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1464 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1465 VAddr = SDValue(MovHighBits, 0);
1466
1467 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1468 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1469 return true;
1470 }
1471 }
1472
1474 // (add n0, c1)
1475
1476 SDValue N0 = Addr.getOperand(0);
1477 uint64_t C1 = Addr.getConstantOperandVal(1);
1478
1479 // Offsets in vaddr must be positive if range checking is enabled.
1480 //
1481 // The total computation of vaddr + soffset + offset must not overflow. If
1482 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1483 // overflowing.
1484 //
1485 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1486 // always perform a range check. If a negative vaddr base index was used,
1487 // this would fail the range check. The overall address computation would
1488 // compute a valid address, but this doesn't happen due to the range
1489 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1490 //
1491 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1492 // MUBUF vaddr, but not on older subtargets which can only do this if the
1493 // sign bit is known 0.
1494 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1495 if (TII->isLegalMUBUFImmOffset(C1) &&
1496 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1497 CurDAG->SignBitIsZero(N0))) {
1498 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1499 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1500 return true;
1501 }
1502 }
1503
1504 // (node)
1505 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1506 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1507 return true;
1508}
1509
1510static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1511 if (Val.getOpcode() != ISD::CopyFromReg)
1512 return false;
1513 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1514 if (!Reg.isPhysical())
1515 return false;
1516 auto RC = TRI.getPhysRegBaseClass(Reg);
1517 return RC && TRI.isSGPRClass(RC);
1518}
1519
1520bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1521 SDValue Addr,
1522 SDValue &SRsrc,
1523 SDValue &SOffset,
1524 SDValue &Offset) const {
1525 const SIRegisterInfo *TRI =
1526 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1530 SDLoc DL(Addr);
1531
1532 // CopyFromReg <sgpr>
1533 if (IsCopyFromSGPR(*TRI, Addr)) {
1534 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1535 SOffset = Addr;
1536 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1537 return true;
1538 }
1539
1540 ConstantSDNode *CAddr;
1541 if (Addr.getOpcode() == ISD::ADD) {
1542 // Add (CopyFromReg <sgpr>) <constant>
1543 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1544 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1545 return false;
1546 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1547 return false;
1548
1549 SOffset = Addr.getOperand(0);
1550 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1551 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1552 // <constant>
1553 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1554 } else {
1555 return false;
1556 }
1557
1558 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1559
1560 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1561 return true;
1562}
1563
1564bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1565 SDValue &SOffset, SDValue &Offset
1566 ) const {
1567 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1568 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1569
1570 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1571 return false;
1572
1573 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1574 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1575 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1576 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1577 APInt::getAllOnes(32).getZExtValue(); // Size
1578 SDLoc DL(Addr);
1579
1580 const SITargetLowering& Lowering =
1581 *static_cast<const SITargetLowering*>(getTargetLowering());
1582
1583 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1584 return true;
1585 }
1586 return false;
1587}
1588
1589bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1590 SDValue &SOffset) const {
1591 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1592 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1593 return true;
1594 }
1595
1596 SOffset = ByteOffsetNode;
1597 return true;
1598}
1599
1600// Find a load or store from corresponding pattern root.
1601// Roots may be build_vector, bitconvert or their combinations.
1604 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1605 return MN;
1606 assert(isa<BuildVectorSDNode>(N));
1607 for (SDValue V : N->op_values())
1608 if (MemSDNode *MN =
1609 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1610 return MN;
1611 llvm_unreachable("cannot find MemSDNode in the pattern!");
1612}
1613
1614bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1615 SDValue &VAddr, SDValue &Offset,
1616 uint64_t FlatVariant) const {
1617 int64_t OffsetVal = 0;
1618
1619 unsigned AS = findMemSDNode(N)->getAddressSpace();
1620
1621 bool CanHaveFlatSegmentOffsetBug =
1622 Subtarget->hasFlatSegmentOffsetBug() &&
1623 FlatVariant == SIInstrFlags::FLAT &&
1625
1626 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1627 SDValue N0, N1;
1628 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1629 (FlatVariant != SIInstrFlags::FlatScratch ||
1630 isFlatScratchBaseLegal(Addr))) {
1631 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1632
1633 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1634 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1635 Addr = N0;
1636 OffsetVal = COffsetVal;
1637 } else {
1638 // If the offset doesn't fit, put the low bits into the offset field and
1639 // add the rest.
1640 //
1641 // For a FLAT instruction the hardware decides whether to access
1642 // global/scratch/shared memory based on the high bits of vaddr,
1643 // ignoring the offset field, so we have to ensure that when we add
1644 // remainder to vaddr it still points into the same underlying object.
1645 // The easiest way to do that is to make sure that we split the offset
1646 // into two pieces that are both >= 0 or both <= 0.
1647
1648 SDLoc DL(N);
1649 uint64_t RemainderOffset;
1650
1651 std::tie(OffsetVal, RemainderOffset) =
1652 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1653
1654 SDValue AddOffsetLo =
1655 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1656 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1657
1658 if (Addr.getValueType().getSizeInBits() == 32) {
1660 Opnds.push_back(N0);
1661 Opnds.push_back(AddOffsetLo);
1662 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1663 if (Subtarget->hasAddNoCarry()) {
1664 AddOp = AMDGPU::V_ADD_U32_e64;
1665 Opnds.push_back(Clamp);
1666 }
1667 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1668 } else {
1669 // TODO: Should this try to use a scalar add pseudo if the base address
1670 // is uniform and saddr is usable?
1671 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1672 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1673
1674 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1675 DL, MVT::i32, N0, Sub0);
1676 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1677 DL, MVT::i32, N0, Sub1);
1678
1679 SDValue AddOffsetHi =
1680 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1681
1682 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1683
1684 SDNode *Add =
1685 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1686 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1687
1688 SDNode *Addc = CurDAG->getMachineNode(
1689 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1690 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1691
1692 SDValue RegSequenceArgs[] = {
1693 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1694 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1695
1696 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1697 MVT::i64, RegSequenceArgs),
1698 0);
1699 }
1700 }
1701 }
1702 }
1703
1704 VAddr = Addr;
1705 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1706 return true;
1707}
1708
1709bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1710 SDValue &VAddr,
1711 SDValue &Offset) const {
1712 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1713}
1714
1715bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1716 SDValue &VAddr,
1717 SDValue &Offset) const {
1718 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1719}
1720
1721bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1722 SDValue &VAddr,
1723 SDValue &Offset) const {
1724 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1726}
1727
1728// If this matches zero_extend i32:x, return x
1730 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1731 return SDValue();
1732
1733 SDValue ExtSrc = Op.getOperand(0);
1734 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1735}
1736
1737// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1738bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1739 SDValue Addr,
1740 SDValue &SAddr,
1741 SDValue &VOffset,
1742 SDValue &Offset) const {
1743 int64_t ImmOffset = 0;
1744
1745 // Match the immediate offset first, which canonically is moved as low as
1746 // possible.
1747
1748 SDValue LHS, RHS;
1749 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1750 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1751 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1752
1753 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1755 Addr = LHS;
1756 ImmOffset = COffsetVal;
1757 } else if (!LHS->isDivergent()) {
1758 if (COffsetVal > 0) {
1759 SDLoc SL(N);
1760 // saddr + large_offset -> saddr +
1761 // (voffset = large_offset & ~MaxOffset) +
1762 // (large_offset & MaxOffset);
1763 int64_t SplitImmOffset, RemainderOffset;
1764 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1766
1767 if (isUInt<32>(RemainderOffset)) {
1768 SDNode *VMov = CurDAG->getMachineNode(
1769 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1770 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1771 VOffset = SDValue(VMov, 0);
1772 SAddr = LHS;
1773 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1774 return true;
1775 }
1776 }
1777
1778 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1779 // is 1 we would need to perform 1 or 2 extra moves for each half of
1780 // the constant and it is better to do a scalar add and then issue a
1781 // single VALU instruction to materialize zero. Otherwise it is less
1782 // instructions to perform VALU adds with immediates or inline literals.
1783 unsigned NumLiterals =
1784 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1785 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1786 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1787 return false;
1788 }
1789 }
1790
1791 // Match the variable offset.
1792 if (Addr.getOpcode() == ISD::ADD) {
1793 LHS = Addr.getOperand(0);
1794 RHS = Addr.getOperand(1);
1795
1796 if (!LHS->isDivergent()) {
1797 // add (i64 sgpr), (zero_extend (i32 vgpr))
1798 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1799 SAddr = LHS;
1800 VOffset = ZextRHS;
1801 }
1802 }
1803
1804 if (!SAddr && !RHS->isDivergent()) {
1805 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1806 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1807 SAddr = RHS;
1808 VOffset = ZextLHS;
1809 }
1810 }
1811
1812 if (SAddr) {
1813 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1814 return true;
1815 }
1816 }
1817
1818 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1819 isa<ConstantSDNode>(Addr))
1820 return false;
1821
1822 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1823 // moves required to copy a 64-bit SGPR to VGPR.
1824 SAddr = Addr;
1825 SDNode *VMov =
1826 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1827 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1828 VOffset = SDValue(VMov, 0);
1829 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1830 return true;
1831}
1832
1834 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1835 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1836 } else if (SAddr.getOpcode() == ISD::ADD &&
1837 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1838 // Materialize this into a scalar move for scalar address to avoid
1839 // readfirstlane.
1840 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1841 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1842 FI->getValueType(0));
1843 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1844 MVT::i32, TFI, SAddr.getOperand(1)),
1845 0);
1846 }
1847
1848 return SAddr;
1849}
1850
1851// Match (32-bit SGPR base) + sext(imm offset)
1852bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1853 SDValue &SAddr,
1854 SDValue &Offset) const {
1855 if (Addr->isDivergent())
1856 return false;
1857
1858 SDLoc DL(Addr);
1859
1860 int64_t COffsetVal = 0;
1861
1862 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1863 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1864 SAddr = Addr.getOperand(0);
1865 } else {
1866 SAddr = Addr;
1867 }
1868
1869 SAddr = SelectSAddrFI(CurDAG, SAddr);
1870
1871 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1872
1873 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1875 int64_t SplitImmOffset, RemainderOffset;
1876 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1878
1879 COffsetVal = SplitImmOffset;
1880
1881 SDValue AddOffset =
1883 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1884 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1885 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1886 SAddr, AddOffset),
1887 0);
1888 }
1889
1890 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1891
1892 return true;
1893}
1894
1895// Check whether the flat scratch SVS swizzle bug affects this access.
1896bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1897 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1898 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1899 return false;
1900
1901 // The bug affects the swizzling of SVS accesses if there is any carry out
1902 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1903 // voffset to (soffset + inst_offset).
1904 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1906 /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
1907 CurDAG->computeKnownBits(SAddr),
1908 KnownBits::makeConstant(APInt(32, ImmOffset)));
1909 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1911 return (VMax & 3) + (SMax & 3) >= 4;
1912}
1913
1914bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1915 SDValue &VAddr, SDValue &SAddr,
1916 SDValue &Offset) const {
1917 int64_t ImmOffset = 0;
1918
1919 SDValue LHS, RHS;
1920 SDValue OrigAddr = Addr;
1921 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1922 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1923 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1924
1925 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1926 Addr = LHS;
1927 ImmOffset = COffsetVal;
1928 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1929 SDLoc SL(N);
1930 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1931 // (large_offset & MaxOffset);
1932 int64_t SplitImmOffset, RemainderOffset;
1933 std::tie(SplitImmOffset, RemainderOffset)
1934 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1935
1936 if (isUInt<32>(RemainderOffset)) {
1937 SDNode *VMov = CurDAG->getMachineNode(
1938 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1939 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1940 VAddr = SDValue(VMov, 0);
1941 SAddr = LHS;
1942 if (!isFlatScratchBaseLegal(Addr))
1943 return false;
1944 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1945 return false;
1946 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1947 return true;
1948 }
1949 }
1950 }
1951
1952 if (Addr.getOpcode() != ISD::ADD)
1953 return false;
1954
1955 LHS = Addr.getOperand(0);
1956 RHS = Addr.getOperand(1);
1957
1958 if (!LHS->isDivergent() && RHS->isDivergent()) {
1959 SAddr = LHS;
1960 VAddr = RHS;
1961 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1962 SAddr = RHS;
1963 VAddr = LHS;
1964 } else {
1965 return false;
1966 }
1967
1968 if (OrigAddr != Addr) {
1969 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1970 return false;
1971 } else {
1972 if (!isFlatScratchBaseLegalSV(OrigAddr))
1973 return false;
1974 }
1975
1976 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1977 return false;
1978 SAddr = SelectSAddrFI(CurDAG, SAddr);
1979 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1980 return true;
1981}
1982
1983// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
1984// not null) offset. If Imm32Only is true, match only 32-bit immediate
1985// offsets available on CI.
1986bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1987 SDValue *SOffset, SDValue *Offset,
1988 bool Imm32Only, bool IsBuffer) const {
1989 assert((!SOffset || !Offset) &&
1990 "Cannot match both soffset and offset at the same time!");
1991
1992 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1993 if (!C) {
1994 if (!SOffset)
1995 return false;
1996 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1997 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1998 *SOffset = ByteOffsetNode;
1999 return true;
2000 }
2001 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2002 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2003 *SOffset = ByteOffsetNode.getOperand(0);
2004 return true;
2005 }
2006 }
2007 return false;
2008 }
2009
2010 SDLoc SL(ByteOffsetNode);
2011
2012 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2013 // offset for S_BUFFER instructions is unsigned.
2014 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2015 std::optional<int64_t> EncodedOffset =
2016 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2017 if (EncodedOffset && Offset && !Imm32Only) {
2018 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2019 return true;
2020 }
2021
2022 // SGPR and literal offsets are unsigned.
2023 if (ByteOffset < 0)
2024 return false;
2025
2026 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2027 if (EncodedOffset && Offset && Imm32Only) {
2028 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2029 return true;
2030 }
2031
2032 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2033 return false;
2034
2035 if (SOffset) {
2036 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2037 *SOffset = SDValue(
2038 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2039 return true;
2040 }
2041
2042 return false;
2043}
2044
2045SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2046 if (Addr.getValueType() != MVT::i32)
2047 return Addr;
2048
2049 // Zero-extend a 32-bit address.
2050 SDLoc SL(Addr);
2051
2054 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2055 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2056
2057 const SDValue Ops[] = {
2058 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2059 Addr,
2060 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2061 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2062 0),
2063 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2064 };
2065
2066 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2067 Ops), 0);
2068}
2069
2070// Match a base and an immediate (if Offset is not null) or an SGPR (if
2071// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2072// true, match only 32-bit immediate offsets available on CI.
2073bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2074 SDValue *SOffset, SDValue *Offset,
2075 bool Imm32Only,
2076 bool IsBuffer) const {
2077 if (SOffset && Offset) {
2078 assert(!Imm32Only && !IsBuffer);
2079 SDValue B;
2080 return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
2081 SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
2082 }
2083
2084 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2085 // wraparound, because s_load instructions perform the addition in 64 bits.
2086 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2087 !Addr->getFlags().hasNoUnsignedWrap())
2088 return false;
2089
2090 SDValue N0, N1;
2091 // Extract the base and offset if possible.
2092 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2093 N0 = Addr.getOperand(0);
2094 N1 = Addr.getOperand(1);
2095 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2096 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2097 }
2098 if (!N0 || !N1)
2099 return false;
2100 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2101 SBase = N0;
2102 return true;
2103 }
2104 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2105 SBase = N1;
2106 return true;
2107 }
2108 return false;
2109}
2110
2111bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2112 SDValue *SOffset, SDValue *Offset,
2113 bool Imm32Only) const {
2114 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2115 SBase = Expand32BitAddress(SBase);
2116 return true;
2117 }
2118
2119 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2120 SBase = Expand32BitAddress(Addr);
2121 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2122 return true;
2123 }
2124
2125 return false;
2126}
2127
2128bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2129 SDValue &Offset) const {
2130 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2131}
2132
2133bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2134 SDValue &Offset) const {
2136 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2137 /* Imm32Only */ true);
2138}
2139
2140bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2141 SDValue &SOffset) const {
2142 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2143}
2144
2145bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2146 SDValue &SOffset,
2147 SDValue &Offset) const {
2148 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2149}
2150
2151bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2152 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2153 /* Imm32Only */ false, /* IsBuffer */ true);
2154}
2155
2156bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2157 SDValue &Offset) const {
2159 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2160 /* Imm32Only */ true, /* IsBuffer */ true);
2161}
2162
2163bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2164 SDValue &Offset) const {
2165 // Match the (soffset + offset) pair as a 32-bit register base and
2166 // an immediate offset.
2167 return N.getValueType() == MVT::i32 &&
2168 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2169 &Offset, /* Imm32Only */ false,
2170 /* IsBuffer */ true);
2171}
2172
2173bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2174 SDValue &Base,
2175 SDValue &Offset) const {
2176 SDLoc DL(Index);
2177
2179 SDValue N0 = Index.getOperand(0);
2180 SDValue N1 = Index.getOperand(1);
2181 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2182
2183 // (add n0, c0)
2184 // Don't peel off the offset (c0) if doing so could possibly lead
2185 // the base (n0) to be negative.
2186 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2187 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2188 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2189 Base = N0;
2190 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2191 return true;
2192 }
2193 }
2194
2195 if (isa<ConstantSDNode>(Index))
2196 return false;
2197
2198 Base = Index;
2199 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2200 return true;
2201}
2202
2203SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2204 SDValue Val, uint32_t Offset,
2205 uint32_t Width) {
2206 if (Val->isDivergent()) {
2207 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2209 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2210
2211 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2212 }
2213 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2214 // Transformation function, pack the offset and width of a BFE into
2215 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2216 // source, bits [5:0] contain the offset and bits [22:16] the width.
2217 uint32_t PackedVal = Offset | (Width << 16);
2218 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2219
2220 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2221}
2222
2223void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2224 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2225 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2226 // Predicate: 0 < b <= c < 32
2227
2228 const SDValue &Shl = N->getOperand(0);
2229 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2230 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2231
2232 if (B && C) {
2233 uint32_t BVal = B->getZExtValue();
2234 uint32_t CVal = C->getZExtValue();
2235
2236 if (0 < BVal && BVal <= CVal && CVal < 32) {
2237 bool Signed = N->getOpcode() == ISD::SRA;
2238 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2239 32 - CVal));
2240 return;
2241 }
2242 }
2243 SelectCode(N);
2244}
2245
2246void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2247 switch (N->getOpcode()) {
2248 case ISD::AND:
2249 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2250 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2251 // Predicate: isMask(mask)
2252 const SDValue &Srl = N->getOperand(0);
2253 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2254 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2255
2256 if (Shift && Mask) {
2257 uint32_t ShiftVal = Shift->getZExtValue();
2258 uint32_t MaskVal = Mask->getZExtValue();
2259
2260 if (isMask_32(MaskVal)) {
2261 uint32_t WidthVal = llvm::popcount(MaskVal);
2262 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2263 WidthVal));
2264 return;
2265 }
2266 }
2267 }
2268 break;
2269 case ISD::SRL:
2270 if (N->getOperand(0).getOpcode() == ISD::AND) {
2271 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2272 // Predicate: isMask(mask >> b)
2273 const SDValue &And = N->getOperand(0);
2274 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2275 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2276
2277 if (Shift && Mask) {
2278 uint32_t ShiftVal = Shift->getZExtValue();
2279 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2280
2281 if (isMask_32(MaskVal)) {
2282 uint32_t WidthVal = llvm::popcount(MaskVal);
2283 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2284 WidthVal));
2285 return;
2286 }
2287 }
2288 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2289 SelectS_BFEFromShifts(N);
2290 return;
2291 }
2292 break;
2293 case ISD::SRA:
2294 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2295 SelectS_BFEFromShifts(N);
2296 return;
2297 }
2298 break;
2299
2301 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2302 SDValue Src = N->getOperand(0);
2303 if (Src.getOpcode() != ISD::SRL)
2304 break;
2305
2306 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2307 if (!Amt)
2308 break;
2309
2310 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2311 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2312 Amt->getZExtValue(), Width));
2313 return;
2314 }
2315 }
2316
2317 SelectCode(N);
2318}
2319
2320bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2321 assert(N->getOpcode() == ISD::BRCOND);
2322 if (!N->hasOneUse())
2323 return false;
2324
2325 SDValue Cond = N->getOperand(1);
2326 if (Cond.getOpcode() == ISD::CopyToReg)
2327 Cond = Cond.getOperand(2);
2328
2329 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2330 return false;
2331
2332 MVT VT = Cond.getOperand(0).getSimpleValueType();
2333 if (VT == MVT::i32)
2334 return true;
2335
2336 if (VT == MVT::i64) {
2337 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2338
2339 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2340 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2341 }
2342
2343 return false;
2344}
2345
2346static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2347 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2348 // Special case for amdgcn.ballot:
2349 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2350 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2351 // =>
2352 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2353 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2354 // Cond becomes a i(WaveSize) full mask value.
2355 // Note that ballot doesn't use SETEQ condition but its easy to support it
2356 // here for completeness, so in this case Negate is set true on return.
2357 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2358 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2359 isNullConstant(VCMP.getOperand(1))) {
2360
2361 auto Cond = VCMP.getOperand(0);
2362 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2363 Cond = Cond.getOperand(0);
2364
2365 if (isBoolSGPR(Cond)) {
2366 Negate = VCMP_CC == ISD::SETEQ;
2367 return Cond;
2368 }
2369 }
2370 return SDValue();
2371}
2372
2373void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2374 SDValue Cond = N->getOperand(1);
2375
2376 if (Cond.isUndef()) {
2377 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2378 N->getOperand(2), N->getOperand(0));
2379 return;
2380 }
2381
2382 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2383 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2384
2385 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2386 bool AndExec = !UseSCCBr;
2387 bool Negate = false;
2388
2389 if (Cond.getOpcode() == ISD::SETCC &&
2390 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2391 SDValue VCMP = Cond->getOperand(0);
2392 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2393 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2394 isNullConstant(Cond->getOperand(1)) &&
2395 // We may encounter ballot.i64 in wave32 mode on -O0.
2396 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2397 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2398 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2399 // BRCOND i1 %C, %BB
2400 // =>
2401 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2402 // VCC = COPY i(WaveSize) %VCMP
2403 // S_CBRANCH_VCCNZ/VCCZ %BB
2404 Negate = CC == ISD::SETEQ;
2405 bool NegatedBallot = false;
2406 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2407 Cond = BallotCond;
2408 UseSCCBr = !BallotCond->isDivergent();
2409 Negate = Negate ^ NegatedBallot;
2410 } else {
2411 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2412 // selected as V_CMP, but this may change for uniform condition.
2413 Cond = VCMP;
2414 UseSCCBr = false;
2415 }
2416 }
2417 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2418 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2419 // used.
2420 AndExec = false;
2421 }
2422
2423 unsigned BrOp =
2424 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2425 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2426 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2427 SDLoc SL(N);
2428
2429 if (AndExec) {
2430 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2431 // analyzed what generates the vcc value, so we do not know whether vcc
2432 // bits for disabled lanes are 0. Thus we need to mask out bits for
2433 // disabled lanes.
2434 //
2435 // For the case that we select S_CBRANCH_SCC1 and it gets
2436 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2437 // SIInstrInfo::moveToVALU which inserts the S_AND).
2438 //
2439 // We could add an analysis of what generates the vcc value here and omit
2440 // the S_AND when is unnecessary. But it would be better to add a separate
2441 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2442 // catches both cases.
2443 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2444 : AMDGPU::S_AND_B64,
2445 SL, MVT::i1,
2446 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2447 : AMDGPU::EXEC,
2448 MVT::i1),
2449 Cond),
2450 0);
2451 }
2452
2453 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2454 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2455 N->getOperand(2), // Basic Block
2456 VCC.getValue(0));
2457}
2458
2459void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2460 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2461 !N->isDivergent()) {
2462 SDValue Src = N->getOperand(0);
2463 if (Src.getValueType() == MVT::f16) {
2464 if (isExtractHiElt(Src, Src)) {
2465 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2466 {Src});
2467 return;
2468 }
2469 }
2470 }
2471
2472 SelectCode(N);
2473}
2474
2475void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2476 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2477 // be copied to an SGPR with readfirstlane.
2478 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2479 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2480
2481 SDValue Chain = N->getOperand(0);
2482 SDValue Ptr = N->getOperand(2);
2483 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2484 MachineMemOperand *MMO = M->getMemOperand();
2485 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2486
2489 SDValue PtrBase = Ptr.getOperand(0);
2490 SDValue PtrOffset = Ptr.getOperand(1);
2491
2492 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2493 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2494 N = glueCopyToM0(N, PtrBase);
2495 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2496 }
2497 }
2498
2499 if (!Offset) {
2500 N = glueCopyToM0(N, Ptr);
2501 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2502 }
2503
2504 SDValue Ops[] = {
2505 Offset,
2506 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2507 Chain,
2508 N->getOperand(N->getNumOperands() - 1) // New glue
2509 };
2510
2511 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2512 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2513}
2514
2515// We need to handle this here because tablegen doesn't support matching
2516// instructions with multiple outputs.
2517void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2518 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2519 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2520 N->getOperand(5), N->getOperand(0)};
2521
2522 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2523 MachineMemOperand *MMO = M->getMemOperand();
2524 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2525 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2526}
2527
2528static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2529 switch (IntrID) {
2530 case Intrinsic::amdgcn_ds_gws_init:
2531 return AMDGPU::DS_GWS_INIT;
2532 case Intrinsic::amdgcn_ds_gws_barrier:
2533 return AMDGPU::DS_GWS_BARRIER;
2534 case Intrinsic::amdgcn_ds_gws_sema_v:
2535 return AMDGPU::DS_GWS_SEMA_V;
2536 case Intrinsic::amdgcn_ds_gws_sema_br:
2537 return AMDGPU::DS_GWS_SEMA_BR;
2538 case Intrinsic::amdgcn_ds_gws_sema_p:
2539 return AMDGPU::DS_GWS_SEMA_P;
2540 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2541 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2542 default:
2543 llvm_unreachable("not a gws intrinsic");
2544 }
2545}
2546
2547void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2548 if (!Subtarget->hasGWS() ||
2549 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2550 !Subtarget->hasGWSSemaReleaseAll())) {
2551 // Let this error.
2552 SelectCode(N);
2553 return;
2554 }
2555
2556 // Chain, intrinsic ID, vsrc, offset
2557 const bool HasVSrc = N->getNumOperands() == 4;
2558 assert(HasVSrc || N->getNumOperands() == 3);
2559
2560 SDLoc SL(N);
2561 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2562 int ImmOffset = 0;
2563 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2564 MachineMemOperand *MMO = M->getMemOperand();
2565
2566 // Don't worry if the offset ends up in a VGPR. Only one lane will have
2567 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2568
2569 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2570 // offset field) % 64. Some versions of the programming guide omit the m0
2571 // part, or claim it's from offset 0.
2572 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2573 // If we have a constant offset, try to use the 0 in m0 as the base.
2574 // TODO: Look into changing the default m0 initialization value. If the
2575 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2576 // the immediate offset.
2577 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2578 ImmOffset = ConstOffset->getZExtValue();
2579 } else {
2580 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2581 ImmOffset = BaseOffset.getConstantOperandVal(1);
2582 BaseOffset = BaseOffset.getOperand(0);
2583 }
2584
2585 // Prefer to do the shift in an SGPR since it should be possible to use m0
2586 // as the result directly. If it's already an SGPR, it will be eliminated
2587 // later.
2588 SDNode *SGPROffset
2589 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2590 BaseOffset);
2591 // Shift to offset in m0
2592 SDNode *M0Base
2593 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2594 SDValue(SGPROffset, 0),
2595 CurDAG->getTargetConstant(16, SL, MVT::i32));
2596 glueCopyToM0(N, SDValue(M0Base, 0));
2597 }
2598
2599 SDValue Chain = N->getOperand(0);
2600 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2601
2602 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2604 if (HasVSrc)
2605 Ops.push_back(N->getOperand(2));
2606 Ops.push_back(OffsetField);
2607 Ops.push_back(Chain);
2608
2609 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2610 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2611}
2612
2613void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2614 if (Subtarget->getLDSBankCount() != 16) {
2615 // This is a single instruction with a pattern.
2616 SelectCode(N);
2617 return;
2618 }
2619
2620 SDLoc DL(N);
2621
2622 // This requires 2 instructions. It is possible to write a pattern to support
2623 // this, but the generated isel emitter doesn't correctly deal with multiple
2624 // output instructions using the same physical register input. The copy to m0
2625 // is incorrectly placed before the second instruction.
2626 //
2627 // TODO: Match source modifiers.
2628 //
2629 // def : Pat <
2630 // (int_amdgcn_interp_p1_f16
2631 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2632 // (i32 timm:$attrchan), (i32 timm:$attr),
2633 // (i1 timm:$high), M0),
2634 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2635 // timm:$attrchan, 0,
2636 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2637 // let Predicates = [has16BankLDS];
2638 // }
2639
2640 // 16 bank LDS
2641 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2642 N->getOperand(5), SDValue());
2643
2644 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2645
2646 SDNode *InterpMov =
2647 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2648 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2649 N->getOperand(3), // Attr
2650 N->getOperand(2), // Attrchan
2651 ToM0.getValue(1) // In glue
2652 });
2653
2654 SDNode *InterpP1LV =
2655 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2656 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2657 N->getOperand(1), // Src0
2658 N->getOperand(3), // Attr
2659 N->getOperand(2), // Attrchan
2660 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2661 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2662 N->getOperand(4), // high
2663 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2664 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2665 SDValue(InterpMov, 1)
2666 });
2667
2668 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2669}
2670
2671void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2672 unsigned IntrID = N->getConstantOperandVal(1);
2673 switch (IntrID) {
2674 case Intrinsic::amdgcn_ds_append:
2675 case Intrinsic::amdgcn_ds_consume: {
2676 if (N->getValueType(0) != MVT::i32)
2677 break;
2678 SelectDSAppendConsume(N, IntrID);
2679 return;
2680 }
2681 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2682 SelectDSBvhStackIntrinsic(N);
2683 return;
2684 }
2685
2686 SelectCode(N);
2687}
2688
2689void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2690 unsigned IntrID = N->getConstantOperandVal(0);
2691 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2692 SDNode *ConvGlueNode = N->getGluedNode();
2693 if (ConvGlueNode) {
2694 // FIXME: Possibly iterate over multiple glue nodes?
2695 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2696 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2697 ConvGlueNode =
2698 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2699 MVT::Glue, SDValue(ConvGlueNode, 0));
2700 } else {
2701 ConvGlueNode = nullptr;
2702 }
2703 switch (IntrID) {
2704 case Intrinsic::amdgcn_wqm:
2705 Opcode = AMDGPU::WQM;
2706 break;
2707 case Intrinsic::amdgcn_softwqm:
2708 Opcode = AMDGPU::SOFT_WQM;
2709 break;
2710 case Intrinsic::amdgcn_wwm:
2711 case Intrinsic::amdgcn_strict_wwm:
2712 Opcode = AMDGPU::STRICT_WWM;
2713 break;
2714 case Intrinsic::amdgcn_strict_wqm:
2715 Opcode = AMDGPU::STRICT_WQM;
2716 break;
2717 case Intrinsic::amdgcn_interp_p1_f16:
2718 SelectInterpP1F16(N);
2719 return;
2720 case Intrinsic::amdgcn_inverse_ballot:
2721 switch (N->getOperand(1).getValueSizeInBits()) {
2722 case 32:
2723 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2724 break;
2725 case 64:
2726 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2727 break;
2728 default:
2729 llvm_unreachable("Unsupported size for inverse ballot mask.");
2730 }
2731 break;
2732 default:
2733 SelectCode(N);
2734 break;
2735 }
2736
2737 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2738 SDValue Src = N->getOperand(1);
2739 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2740 }
2741
2742 if (ConvGlueNode) {
2743 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2744 NewOps.push_back(SDValue(ConvGlueNode, 0));
2745 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2746 }
2747}
2748
2749void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2750 unsigned IntrID = N->getConstantOperandVal(1);
2751 switch (IntrID) {
2752 case Intrinsic::amdgcn_ds_gws_init:
2753 case Intrinsic::amdgcn_ds_gws_barrier:
2754 case Intrinsic::amdgcn_ds_gws_sema_v:
2755 case Intrinsic::amdgcn_ds_gws_sema_br:
2756 case Intrinsic::amdgcn_ds_gws_sema_p:
2757 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2758 SelectDS_GWS(N, IntrID);
2759 return;
2760 default:
2761 break;
2762 }
2763
2764 SelectCode(N);
2765}
2766
2767void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2768 SDValue Log2WaveSize =
2769 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2770 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2771 {N->getOperand(0), Log2WaveSize});
2772}
2773
2774void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2775 SDValue SrcVal = N->getOperand(1);
2776 if (SrcVal.getValueType() != MVT::i32) {
2777 SelectCode(N); // Emit default error
2778 return;
2779 }
2780
2781 SDValue CopyVal;
2783 SDLoc SL(N);
2784
2785 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2786 CopyVal = SrcVal.getOperand(0);
2787 } else {
2788 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2789 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2790
2791 if (N->isDivergent()) {
2792 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2793 MVT::i32, SrcVal),
2794 0);
2795 }
2796
2797 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2798 {SrcVal, Log2WaveSize}),
2799 0);
2800 }
2801
2802 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2803 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2804}
2805
2806bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2807 unsigned &Mods,
2808 bool IsCanonicalizing,
2809 bool AllowAbs) const {
2810 Mods = SISrcMods::NONE;
2811 Src = In;
2812
2813 if (Src.getOpcode() == ISD::FNEG) {
2814 Mods |= SISrcMods::NEG;
2815 Src = Src.getOperand(0);
2816 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2817 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2818 // denormal mode, but we're implicitly canonicalizing in a source operand.
2819 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2820 if (LHS && LHS->isZero()) {
2821 Mods |= SISrcMods::NEG;
2822 Src = Src.getOperand(1);
2823 }
2824 }
2825
2826 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2827 Mods |= SISrcMods::ABS;
2828 Src = Src.getOperand(0);
2829 }
2830
2831 return true;
2832}
2833
2834bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2835 SDValue &SrcMods) const {
2836 unsigned Mods;
2837 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2838 /*AllowAbs=*/true)) {
2839 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2840 return true;
2841 }
2842
2843 return false;
2844}
2845
2846bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2847 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2848 unsigned Mods;
2849 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2850 /*AllowAbs=*/true)) {
2851 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2852 return true;
2853 }
2854
2855 return false;
2856}
2857
2858bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2859 SDValue &SrcMods) const {
2860 unsigned Mods;
2861 if (SelectVOP3ModsImpl(In, Src, Mods,
2862 /*IsCanonicalizing=*/true,
2863 /*AllowAbs=*/false)) {
2864 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2865 return true;
2866 }
2867
2868 return false;
2869}
2870
2871bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2872 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2873 return false;
2874
2875 Src = In;
2876 return true;
2877}
2878
2879bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2880 SDValue &SrcMods,
2881 bool OpSel) const {
2882 unsigned Mods;
2883 if (SelectVOP3ModsImpl(In, Src, Mods,
2884 /*IsCanonicalizing=*/true,
2885 /*AllowAbs=*/false)) {
2886 if (OpSel)
2887 Mods |= SISrcMods::OP_SEL_0;
2888 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2889 return true;
2890 }
2891
2892 return false;
2893}
2894
2895bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2896 SDValue &SrcMods) const {
2897 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2898}
2899
2900bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2901 SDValue &SrcMods) const {
2902 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2903}
2904
2905bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2906 SDValue &SrcMods, SDValue &Clamp,
2907 SDValue &Omod) const {
2908 SDLoc DL(In);
2909 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2910 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2911
2912 return SelectVOP3Mods(In, Src, SrcMods);
2913}
2914
2915bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2916 SDValue &SrcMods, SDValue &Clamp,
2917 SDValue &Omod) const {
2918 SDLoc DL(In);
2919 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2920 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2921
2922 return SelectVOP3BMods(In, Src, SrcMods);
2923}
2924
2925bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2926 SDValue &Clamp, SDValue &Omod) const {
2927 Src = In;
2928
2929 SDLoc DL(In);
2930 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2931 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2932
2933 return true;
2934}
2935
2936bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2937 SDValue &SrcMods, bool IsDOT) const {
2938 unsigned Mods = SISrcMods::NONE;
2939 Src = In;
2940
2941 // TODO: Handle G_FSUB 0 as fneg
2942 if (Src.getOpcode() == ISD::FNEG) {
2944 Src = Src.getOperand(0);
2945 }
2946
2947 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2948 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2949 unsigned VecMods = Mods;
2950
2951 SDValue Lo = stripBitcast(Src.getOperand(0));
2952 SDValue Hi = stripBitcast(Src.getOperand(1));
2953
2954 if (Lo.getOpcode() == ISD::FNEG) {
2955 Lo = stripBitcast(Lo.getOperand(0));
2956 Mods ^= SISrcMods::NEG;
2957 }
2958
2959 if (Hi.getOpcode() == ISD::FNEG) {
2960 Hi = stripBitcast(Hi.getOperand(0));
2961 Mods ^= SISrcMods::NEG_HI;
2962 }
2963
2964 if (isExtractHiElt(Lo, Lo))
2965 Mods |= SISrcMods::OP_SEL_0;
2966
2967 if (isExtractHiElt(Hi, Hi))
2968 Mods |= SISrcMods::OP_SEL_1;
2969
2970 unsigned VecSize = Src.getValueSizeInBits();
2971 Lo = stripExtractLoElt(Lo);
2972 Hi = stripExtractLoElt(Hi);
2973
2974 if (Lo.getValueSizeInBits() > VecSize) {
2976 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2977 MVT::getIntegerVT(VecSize), Lo);
2978 }
2979
2980 if (Hi.getValueSizeInBits() > VecSize) {
2982 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2983 MVT::getIntegerVT(VecSize), Hi);
2984 }
2985
2986 assert(Lo.getValueSizeInBits() <= VecSize &&
2987 Hi.getValueSizeInBits() <= VecSize);
2988
2989 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2990 // Really a scalar input. Just select from the low half of the register to
2991 // avoid packing.
2992
2993 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2994 Src = Lo;
2995 } else {
2996 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2997
2998 SDLoc SL(In);
3000 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3001 Lo.getValueType()), 0);
3002 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3003 : AMDGPU::SReg_64RegClassID;
3004 const SDValue Ops[] = {
3005 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3006 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3007 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3008
3009 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3010 Src.getValueType(), Ops), 0);
3011 }
3012 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3013 return true;
3014 }
3015
3016 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3017 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3018 .bitcastToAPInt().getZExtValue();
3019 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3020 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3021 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3022 return true;
3023 }
3024 }
3025
3026 Mods = VecMods;
3027 }
3028
3029 // Packed instructions do not have abs modifiers.
3030 Mods |= SISrcMods::OP_SEL_1;
3031
3032 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3033 return true;
3034}
3035
3036bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3037 SDValue &SrcMods) const {
3038 return SelectVOP3PMods(In, Src, SrcMods, true);
3039}
3040
3041bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3042 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3043 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3044 // 1 promotes packed values to signed, 0 treats them as unsigned.
3045 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3046
3047 unsigned Mods = SISrcMods::OP_SEL_1;
3048 unsigned SrcSign = C->getZExtValue();
3049 if (SrcSign == 1)
3050 Mods ^= SISrcMods::NEG;
3051
3052 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3053 return true;
3054}
3055
3056bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3057 SDValue &Src) const {
3058 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3059 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3060
3061 unsigned Mods = SISrcMods::OP_SEL_1;
3062 unsigned SrcVal = C->getZExtValue();
3063 if (SrcVal == 1)
3064 Mods |= SISrcMods::OP_SEL_0;
3065
3066 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3067 return true;
3068}
3069
3071 llvm::SelectionDAG *CurDAG,
3072 const SDLoc &DL) {
3073 unsigned DstRegClass;
3074 EVT DstTy;
3075 switch (Elts.size()) {
3076 case 8:
3077 DstRegClass = AMDGPU::VReg_256RegClassID;
3078 DstTy = MVT::v8i32;
3079 break;
3080 case 4:
3081 DstRegClass = AMDGPU::VReg_128RegClassID;
3082 DstTy = MVT::v4i32;
3083 break;
3084 case 2:
3085 DstRegClass = AMDGPU::VReg_64RegClassID;
3086 DstTy = MVT::v2i32;
3087 break;
3088 default:
3089 llvm_unreachable("unhandled Reg sequence size");
3090 }
3091
3093 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3094 for (unsigned i = 0; i < Elts.size(); ++i) {
3095 Ops.push_back(Elts[i]);
3096 Ops.push_back(CurDAG->getTargetConstant(
3098 }
3099 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3100}
3101
3103 llvm::SelectionDAG *CurDAG,
3104 const SDLoc &DL) {
3105 SmallVector<SDValue, 8> PackedElts;
3106 assert("unhandled Reg sequence size" &&
3107 (Elts.size() == 8 || Elts.size() == 16));
3108
3109 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3110 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3111 for (unsigned i = 0; i < Elts.size(); i += 2) {
3112 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3113 SDValue HiSrc;
3114 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3115 PackedElts.push_back(HiSrc);
3116 } else {
3117 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3118 MachineSDNode *Packed =
3119 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3120 {Elts[i + 1], Elts[i], PackLoLo});
3121 PackedElts.push_back(SDValue(Packed, 0));
3122 }
3123 }
3124
3125 return buildRegSequence32(PackedElts, CurDAG, DL);
3126}
3127
3129 llvm::SelectionDAG *CurDAG,
3130 const SDLoc &DL, unsigned ElementSize) {
3131 if (ElementSize == 16)
3132 return buildRegSequence16(Elts, CurDAG, DL);
3133 if (ElementSize == 32)
3134 return buildRegSequence32(Elts, CurDAG, DL);
3135 llvm_unreachable("Unhandled element size");
3136}
3137
3138static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3140 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3141 unsigned ElementSize) {
3142 if (ModOpcode == ISD::FNEG) {
3143 Mods |= SISrcMods::NEG;
3144 // Check if all elements also have abs modifier
3145 SmallVector<SDValue, 8> NegAbsElts;
3146 for (auto El : Elts) {
3147 if (El.getOpcode() != ISD::FABS)
3148 break;
3149 NegAbsElts.push_back(El->getOperand(0));
3150 }
3151 if (Elts.size() != NegAbsElts.size()) {
3152 // Neg
3153 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3154 } else {
3155 // Neg and Abs
3156 Mods |= SISrcMods::NEG_HI;
3157 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3158 }
3159 } else {
3160 assert(ModOpcode == ISD::FABS);
3161 // Abs
3162 Mods |= SISrcMods::NEG_HI;
3163 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3164 }
3165}
3166
3167// Check all f16 elements for modifiers while looking through b32 and v2b16
3168// build vector, stop if element does not satisfy ModifierCheck.
3169static void
3171 std::function<bool(SDValue)> ModifierCheck) {
3172 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3173 if (auto *F16Pair =
3174 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3175 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3176 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3177 if (!ModifierCheck(ElF16))
3178 break;
3179 }
3180 }
3181 }
3182}
3183
3184bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3185 SDValue &SrcMods) const {
3186 Src = In;
3187 unsigned Mods = SISrcMods::OP_SEL_1;
3188
3189 // mods are on f16 elements
3190 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3192
3193 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3194 if (Element.getOpcode() != ISD::FNEG)
3195 return false;
3196 EltsF16.push_back(Element.getOperand(0));
3197 return true;
3198 });
3199
3200 // All elements have neg modifier
3201 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3202 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3203 Mods |= SISrcMods::NEG;
3204 Mods |= SISrcMods::NEG_HI;
3205 }
3206 }
3207
3208 // mods are on v2f16 elements
3209 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3210 SmallVector<SDValue, 8> EltsV2F16;
3211 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3212 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3213 // Based on first element decide which mod we match, neg or abs
3214 if (ElV2f16.getOpcode() != ISD::FNEG)
3215 break;
3216 EltsV2F16.push_back(ElV2f16.getOperand(0));
3217 }
3218
3219 // All pairs of elements have neg modifier
3220 if (BV->getNumOperands() == EltsV2F16.size()) {
3221 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3222 Mods |= SISrcMods::NEG;
3223 Mods |= SISrcMods::NEG_HI;
3224 }
3225 }
3226
3227 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3228 return true;
3229}
3230
3231bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3232 SDValue &SrcMods) const {
3233 Src = In;
3234 unsigned Mods = SISrcMods::OP_SEL_1;
3235 unsigned ModOpcode;
3236
3237 // mods are on f16 elements
3238 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3240 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3241 // Based on first element decide which mod we match, neg or abs
3242 if (EltsF16.empty())
3243 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3244 if (ElF16.getOpcode() != ModOpcode)
3245 return false;
3246 EltsF16.push_back(ElF16.getOperand(0));
3247 return true;
3248 });
3249
3250 // All elements have ModOpcode modifier
3251 if (BV->getNumOperands() * 2 == EltsF16.size())
3252 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3253 16);
3254 }
3255
3256 // mods are on v2f16 elements
3257 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3258 SmallVector<SDValue, 8> EltsV2F16;
3259
3260 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3261 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3262 // Based on first element decide which mod we match, neg or abs
3263 if (EltsV2F16.empty())
3264 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3265 if (ElV2f16->getOpcode() != ModOpcode)
3266 break;
3267 EltsV2F16.push_back(ElV2f16->getOperand(0));
3268 }
3269
3270 // All elements have ModOpcode modifier
3271 if (BV->getNumOperands() == EltsV2F16.size())
3272 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3273 32);
3274 }
3275
3276 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3277 return true;
3278}
3279
3280bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3281 SDValue &SrcMods) const {
3282 Src = In;
3283 unsigned Mods = SISrcMods::OP_SEL_1;
3285
3286 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3287 assert(BV->getNumOperands() > 0);
3288 // Based on first element decide which mod we match, neg or abs
3289 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3290 unsigned ModOpcode =
3291 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3292 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3293 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3294 if (ElF32.getOpcode() != ModOpcode)
3295 break;
3296 EltsF32.push_back(ElF32.getOperand(0));
3297 }
3298
3299 // All elements had ModOpcode modifier
3300 if (BV->getNumOperands() == EltsF32.size())
3301 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3302 32);
3303 }
3304
3305 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3306 return true;
3307}
3308
3309bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3310 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3311 BitVector UndefElements;
3312 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3313 if (isInlineImmediate(Splat.getNode())) {
3314 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3315 unsigned Imm = C->getAPIntValue().getSExtValue();
3316 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3317 return true;
3318 }
3319 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3320 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3321 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3322 return true;
3323 }
3324 llvm_unreachable("unhandled Constant node");
3325 }
3326 }
3327
3328 // 16 bit splat
3329 SDValue SplatSrc32 = stripBitcast(In);
3330 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3331 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3332 SDValue SplatSrc16 = stripBitcast(Splat32);
3333 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3334 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3335 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3336 std::optional<APInt> RawValue;
3337 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3338 RawValue = C->getValueAPF().bitcastToAPInt();
3339 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3340 RawValue = C->getAPIntValue();
3341
3342 if (RawValue.has_value()) {
3343 EVT VT = In.getValueType().getScalarType();
3344 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3345 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3348 RawValue.value());
3349 if (TII->isInlineConstant(FloatVal)) {
3350 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3351 MVT::i16);
3352 return true;
3353 }
3354 } else if (VT.getSimpleVT() == MVT::i16) {
3355 if (TII->isInlineConstant(RawValue.value())) {
3356 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3357 MVT::i16);
3358 return true;
3359 }
3360 } else
3361 llvm_unreachable("unknown 16-bit type");
3362 }
3363 }
3364 }
3365
3366 return false;
3367}
3368
3369bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3370 SDValue &IndexKey) const {
3371 unsigned Key = 0;
3372 Src = In;
3373
3374 if (In.getOpcode() == ISD::SRL) {
3375 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3376 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3377 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3378 ShiftAmt->getZExtValue() % 8 == 0) {
3379 Key = ShiftAmt->getZExtValue() / 8;
3380 Src = ShiftSrc;
3381 }
3382 }
3383
3384 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3385 return true;
3386}
3387
3388bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3389 SDValue &IndexKey) const {
3390 unsigned Key = 0;
3391 Src = In;
3392
3393 if (In.getOpcode() == ISD::SRL) {
3394 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3395 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3396 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3397 ShiftAmt->getZExtValue() == 16) {
3398 Key = 1;
3399 Src = ShiftSrc;
3400 }
3401 }
3402
3403 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3404 return true;
3405}
3406
3407bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3408 SDValue &SrcMods) const {
3409 Src = In;
3410 // FIXME: Handle op_sel
3411 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3412 return true;
3413}
3414
3415bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3416 SDValue &SrcMods) const {
3417 // FIXME: Handle op_sel
3418 return SelectVOP3Mods(In, Src, SrcMods);
3419}
3420
3421// The return value is not whether the match is possible (which it always is),
3422// but whether or not it a conversion is really used.
3423bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3424 unsigned &Mods) const {
3425 Mods = 0;
3426 SelectVOP3ModsImpl(In, Src, Mods);
3427
3428 if (Src.getOpcode() == ISD::FP_EXTEND) {
3429 Src = Src.getOperand(0);
3430 assert(Src.getValueType() == MVT::f16);
3431 Src = stripBitcast(Src);
3432
3433 // Be careful about folding modifiers if we already have an abs. fneg is
3434 // applied last, so we don't want to apply an earlier fneg.
3435 if ((Mods & SISrcMods::ABS) == 0) {
3436 unsigned ModsTmp;
3437 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3438
3439 if ((ModsTmp & SISrcMods::NEG) != 0)
3440 Mods ^= SISrcMods::NEG;
3441
3442 if ((ModsTmp & SISrcMods::ABS) != 0)
3443 Mods |= SISrcMods::ABS;
3444 }
3445
3446 // op_sel/op_sel_hi decide the source type and source.
3447 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3448 // If the sources's op_sel is set, it picks the high half of the source
3449 // register.
3450
3451 Mods |= SISrcMods::OP_SEL_1;
3452 if (isExtractHiElt(Src, Src)) {
3453 Mods |= SISrcMods::OP_SEL_0;
3454
3455 // TODO: Should we try to look for neg/abs here?
3456 }
3457
3458 return true;
3459 }
3460
3461 return false;
3462}
3463
3464bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3465 SDValue &SrcMods) const {
3466 unsigned Mods = 0;
3467 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3468 return false;
3469 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3470 return true;
3471}
3472
3473bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3474 SDValue &SrcMods) const {
3475 unsigned Mods = 0;
3476 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3477 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3478 return true;
3479}
3480
3481SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3482 if (In.isUndef())
3483 return CurDAG->getUNDEF(MVT::i32);
3484
3485 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3486 SDLoc SL(In);
3487 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3488 }
3489
3490 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3491 SDLoc SL(In);
3492 return CurDAG->getConstant(
3493 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3494 }
3495
3496 SDValue Src;
3497 if (isExtractHiElt(In, Src))
3498 return Src;
3499
3500 return SDValue();
3501}
3502
3503bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3505
3506 const SIRegisterInfo *SIRI =
3507 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3508 const SIInstrInfo * SII =
3509 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3510
3511 unsigned Limit = 0;
3512 bool AllUsesAcceptSReg = true;
3513 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3514 Limit < 10 && U != E; ++U, ++Limit) {
3515 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3516
3517 // If the register class is unknown, it could be an unknown
3518 // register class that needs to be an SGPR, e.g. an inline asm
3519 // constraint
3520 if (!RC || SIRI->isSGPRClass(RC))
3521 return false;
3522
3523 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3524 AllUsesAcceptSReg = false;
3525 SDNode * User = *U;
3526 if (User->isMachineOpcode()) {
3527 unsigned Opc = User->getMachineOpcode();
3528 const MCInstrDesc &Desc = SII->get(Opc);
3529 if (Desc.isCommutable()) {
3530 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3531 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3532 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3533 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3534 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3535 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3536 CommutedRC == &AMDGPU::VS_64RegClass)
3537 AllUsesAcceptSReg = true;
3538 }
3539 }
3540 }
3541 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
3542 // commuting current user. This means have at least one use
3543 // that strictly require VGPR. Thus, we will not attempt to commute
3544 // other user instructions.
3545 if (!AllUsesAcceptSReg)
3546 break;
3547 }
3548 }
3549 return !AllUsesAcceptSReg && (Limit < 10);
3550}
3551
3552bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3553 auto Ld = cast<LoadSDNode>(N);
3554
3555 const MachineMemOperand *MMO = Ld->getMemOperand();
3556 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3557 return false;
3558
3559 return MMO->getSize().hasValue() &&
3560 Ld->getAlign() >=
3561 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3562 uint64_t(4))) &&
3563 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3564 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3565 (Subtarget->getScalarizeGlobalBehavior() &&
3566 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3567 Ld->isSimple() &&
3568 static_cast<const SITargetLowering *>(getTargetLowering())
3569 ->isMemOpHasNoClobberedMemOperand(N)));
3570}
3571
3574 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3575 bool IsModified = false;
3576 do {
3577 IsModified = false;
3578
3579 // Go over all selected nodes and try to fold them a bit more
3581 while (Position != CurDAG->allnodes_end()) {
3582 SDNode *Node = &*Position++;
3583 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3584 if (!MachineNode)
3585 continue;
3586
3587 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3588 if (ResNode != Node) {
3589 if (ResNode)
3590 ReplaceUses(Node, ResNode);
3591 IsModified = true;
3592 }
3593 }
3595 } while (IsModified);
3596}
3597
3598char AMDGPUDAGToDAGISel::ID = 0;
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static MachineSDNode * buildRegSequence32(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL)
static SDValue matchZExtFromI32(SDValue Op)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static MachineSDNode * buildRegSequence(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL, unsigned ElementSize)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static MemSDNode * findMemSDNode(SDNode *N)
static bool isNoUnsignedWrap(SDValue Addr)
static MachineSDNode * buildRegSequence16(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< SDValue > &Elts, SDValue &Src, llvm::SelectionDAG *CurDAG, const SDLoc &DL, unsigned ElementSize)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
unsigned const TargetRegisterInfo * TRI
if(VerifyEach)
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
pre isel intrinsic Pre ISel Intrinsic Lowering
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
support::ulittle16_t & Lo
Definition: aarch32.cpp:206
support::ulittle16_t & Hi
Definition: aarch32.cpp:205
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPUDAGToDAGISel()=delete
bool matchLoadD16FromBuildVector(SDNode *N) const
static bool isUniformMMO(const MachineMemOperand *MMO)
unsigned getWavefrontSizeLog2() const
bool hasInv2PiInlineImm() const
static SDValue stripBitcast(SDValue Val)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1606
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
A "pseudo-class" with methods for operating on BUILD_VECTORs.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
int64_t getSExtValue() const
This class represents an Operation in the Expression.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
int getLDSBankCount() const
Definition: GCNSubtarget.h:326
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:454
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:458
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:613
bool hasDLInsts() const
Definition: GCNSubtarget.h:754
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:252
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:539
bool hasSignedScratchOffsets() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:264
bool hasDOTOpSelHazard() const
bool d16PreservesUnusedBits() const
Definition: GCNSubtarget.h:681
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:669
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:916
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:691
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:517
Generation getGeneration() const
Definition: GCNSubtarget.h:303
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:705
bool hasAddr64() const
Definition: GCNSubtarget.h:367
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:713
bool hasSALUFloatInsts() const
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:593
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
const Triple & getTargetTriple() const
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
MachineFunction * MF
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:473
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDNode * SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT)
These are used for target selectors to mutate the specified node to have the specified return type,...
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
allnodes_const_iterator allnodes_begin() const
Definition: SelectionDAG.h:530
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
allnodes_const_iterator allnodes_end() const
Definition: SelectionDAG.h:531
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:726
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNodes()
This method deletes all unreachable nodes in the SelectionDAG.
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:472
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:772
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:675
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:468
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:553
ilist< SDNode >::iterator allnodes_iterator
Definition: SelectionDAG.h:533
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
static const unsigned CommuteAnyOperandIndex
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:361
Legacy analysis pass which computes a CycleInfo.
LLVM Value Representation.
Definition: Value.h:74
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
Definition: ISDOpcodes.h:1126
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:269
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1031
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1396
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:208
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ TargetFrameIndex
Definition: ISDOpcodes.h:166
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:279
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1070
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1594
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1523
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
constexpr const char32_t SBase
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:240
bool isBoolSGPR(SDValue V)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG, ready for instruction scheduling.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
unsigned M0(unsigned Val)
Definition: VE.h:375
#define N
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:248
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Description of the encoding of one expression Op.
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:57
static unsigned getSubRegFromChannel(unsigned Channel)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.