//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}
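
// For example, a DAG like
//   (i16 (truncate (srl (i32 (bitcast (v2f16 X))), 16)))
// is matched here: Out is set to the bitcast-stripped source of the shift,
// letting the caller address the high half of X directly.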

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

101 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
105#ifdef EXPENSIVE_CHECKS
108#endif
110 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
111
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(ID, TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
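
// Example: on gfx9 and earlier, a 16-bit op such as v_add_f16 writes zeros
// into bits 31:16 of its destination VGPR, so the cases above return true
// there; on gfx10+ those bits are preserved instead and the function
// returns false.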

void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISel::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    SDValue TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
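
// For instance, (v2f16 (build_vector lo, (f16 (load ptr)))) becomes a single
// LOAD_D16_HI node: the 16 bits are loaded straight into the high half of
// the result register, with the bitcast of lo passed as the tied input that
// supplies the untouched low half.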

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering &Lowering =
      *static_cast<const SITargetLowering *>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
      glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
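
// The node built here corresponds to a sequence like (e.g. for
// Imm = 0x0000123400005678):
//   s_mov_b32 s0, 0x5678   ; low 32 bits
//   s_mov_b32 s1, 0x1234   ; high 32 bits
// with REG_SEQUENCE presenting s[0:1] as a single 64-bit SGPR value.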

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
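
// Example: for (srl i32:x, (and i32:y, 31)) a caller passes ShAmtBits = 5;
// the mask 31 has five trailing ones, so the AND is unneeded because the
// shift hardware only consumes the low 5 bits of the amount anyway.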

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
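
// For a divergent i64 add with carry out, the expansion above is roughly:
//   %lo  = V_ADD_CO_U32_e32  %a.sub0, %b.sub0    ; writes VCC
//   %hi  = V_ADDC_U32_e32    %a.sub1, %b.sub1    ; reads and writes VCC
//   %res = REG_SEQUENCE %lo, sub0, %hi, sub1
// The uniform path uses S_ADD_U32/S_ADDC_U32 with the carry in SCC instead.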

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
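
// Example: (lo, hi) = umul_lohi x, y is selected as one v_mad_u64_u32 with
// a zero addend; the two 32-bit halves of the 64-bit product are then
// peeled off as sub0 and sub1 by the EXTRACT_SUBREGs above.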

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}
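
// Example: an LDS access at (add i32:%ptr, 64) yields Base = %ptr and a
// 16-bit immediate offset of 64 (roughly ds_read_b32 %v, %ptr offset:64),
// provided the offset fits in 16 bits and, on Southern Islands, the base is
// known non-negative.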

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
                                                uint64_t FlatVariant) const {
  if (FlatVariant != SIInstrFlags::FlatScratch)
    return true;
  // When the value in the 32-bit Base can be negative, calculate the scratch
  // offset using a 32-bit add instruction; otherwise use Base (unsigned) +
  // offset.
  return CurDAG->SignBitIsZero(Base);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
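
// Example: a paired 4-byte LDS access at (add i32:%ptr, 24) gives
// Base = %ptr, offset0 = 6, offset1 = 7 (byte offsets 24 and 28 divided by
// the element size), which ds_read2_b32/ds_write2_b32 can encode directly
// in their two 8-bit offset fields.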

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        isFlatScratchBaseLegal(N0, FlatVariant)) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  return true;
}
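
// Example of the split above: if the offset field were a signed 12-bit
// immediate, base+0x1234 would not fit, so it is selected as
//   vaddr  = base + 0x1000   (materialized add)
//   offset = 0x234
// with both pieces non-negative so the sum stays in the same object.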

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

        if (isUInt<32>(RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
          !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
  return true;
}
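
// Example: (load (add (i64 sgpr:%base), (zext (i32 vgpr:%off)))) yields
// SAddr = %base, VOffset = %off and a zero immediate offset, i.e. roughly
// global_load_dword v, v_off, s[base:base+1]; a fully uniform address
// instead gets a v_mov_b32 0 materialized as its voffset.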

static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  return SAddr;
}
1772
1773// Match (32-bit SGPR base) + sext(imm offset)
1774bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1775 SDValue &SAddr,
1776 SDValue &Offset) const {
1777 if (Addr->isDivergent())
1778 return false;
1779
1780 SDLoc DL(Addr);
1781
1782 int64_t COffsetVal = 0;
1783
1784 if (CurDAG->isBaseWithConstantOffset(Addr) &&
1785 isFlatScratchBaseLegal(Addr.getOperand(0))) {
1786 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1787 SAddr = Addr.getOperand(0);
1788 } else {
1789 SAddr = Addr;
1790 }
1791
1792 SAddr = SelectSAddrFI(CurDAG, SAddr);
1793
1794 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1795
1796 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1797 SIInstrFlags::FlatScratch)) {
1798 int64_t SplitImmOffset, RemainderOffset;
1799 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1800 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1801
1802 COffsetVal = SplitImmOffset;
1803
1804 SDValue AddOffset =
1805 SAddr.getOpcode() == ISD::TargetFrameIndex
1806 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1807 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1808 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1809 SAddr, AddOffset),
1810 0);
1811 }
1812
1813 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1814
1815 return true;
1816}
1817
1818// Check whether the flat scratch SVS swizzle bug affects this access.
1819bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1820 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1821 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1822 return false;
1823
1824 // The bug affects the swizzling of SVS accesses if there is any carry out
1825 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1826 // voffset to (soffset + inst_offset).
1827 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1828 KnownBits SKnown = KnownBits::computeForAddSub(
1829 true, false, CurDAG->computeKnownBits(SAddr),
1830 KnownBits::makeConstant(APInt(32, ImmOffset)));
1831 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1832 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1833 return (VMax & 3) + (SMax & 3) >= 4;
1834}
1835
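// Match an (SGPR base) + (VGPR offset) + sext(imm offset) triple for scratch
// SVS addressing, splitting an oversized immediate into a VGPR part when
// profitable.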
1836bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1837 SDValue &VAddr, SDValue &SAddr,
1838 SDValue &Offset) const {
1839 int64_t ImmOffset = 0;
1840
1841 SDValue LHS, RHS;
1842 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1843 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1844 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1845
1846 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1847 Addr = LHS;
1848 ImmOffset = COffsetVal;
1849 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1850 SDLoc SL(N);
1851 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1852 // (large_offset & MaxOffset);
1853 int64_t SplitImmOffset, RemainderOffset;
1854 std::tie(SplitImmOffset, RemainderOffset)
1855 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1856
1857 if (isUInt<32>(RemainderOffset)) {
1858 SDNode *VMov = CurDAG->getMachineNode(
1859 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1860 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1861 VAddr = SDValue(VMov, 0);
1862 SAddr = LHS;
1863 if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
1864 return false;
1865 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1866 return false;
1867 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1868 return true;
1869 }
1870 }
1871 }
1872
1873 if (Addr.getOpcode() != ISD::ADD)
1874 return false;
1875
1876 LHS = Addr.getOperand(0);
1877 RHS = Addr.getOperand(1);
1878
1879 if (!LHS->isDivergent() && RHS->isDivergent()) {
1880 SAddr = LHS;
1881 VAddr = RHS;
1882 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1883 SAddr = RHS;
1884 VAddr = LHS;
1885 } else {
1886 return false;
1887 }
1888
1889 if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
1890 return false;
1891
1892 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1893 return false;
1894 SAddr = SelectSAddrFI(CurDAG, SAddr);
1895 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1896 return true;
1897}
1898
1899// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
1900// not null) offset. If Imm32Only is true, match only 32-bit immediate
1901// offsets available on CI.
1902bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1903 SDValue *SOffset, SDValue *Offset,
1904 bool Imm32Only, bool IsBuffer) const {
1905 assert((!SOffset || !Offset) &&
1906 "Cannot match both soffset and offset at the same time!");
1907
1908 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1909 if (!C) {
1910 if (!SOffset)
1911 return false;
1912 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1913 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1914 *SOffset = ByteOffsetNode;
1915 return true;
1916 }
1917 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1918 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1919 *SOffset = ByteOffsetNode.getOperand(0);
1920 return true;
1921 }
1922 }
1923 return false;
1924 }
1925
1926 SDLoc SL(ByteOffsetNode);
1927
1928 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
1929 // offset for S_BUFFER instructions is unsigned.
1930 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
1931 std::optional<int64_t> EncodedOffset =
1932 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
1933 if (EncodedOffset && Offset && !Imm32Only) {
1934 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1935 return true;
1936 }
1937
1938 // SGPR and literal offsets are unsigned.
1939 if (ByteOffset < 0)
1940 return false;
1941
1942 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1943 if (EncodedOffset && Offset && Imm32Only) {
1944 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1945 return true;
1946 }
1947
1948 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1949 return false;
1950
1951 if (SOffset) {
1952 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1953 *SOffset = SDValue(
1954 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1955 return true;
1956 }
1957
1958 return false;
1959}
1960
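// Widen a 32-bit address to the 64-bit form SMRD patterns expect; the high
// half comes from the function info's get32BitAddressHighBits().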
1961SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1962 if (Addr.getValueType() != MVT::i32)
1963 return Addr;
1964
1965 // Zero-extend a 32-bit address.
1966 SDLoc SL(Addr);
1967
1968 const MachineFunction &MF = CurDAG->getMachineFunction();
1969 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1970 unsigned AddrHiVal = Info->get32BitAddressHighBits();
1971 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1972
1973 const SDValue Ops[] = {
1974 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1975 Addr,
1976 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1977 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1978 0),
1979 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1980 };
1981
1982 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1983 Ops), 0);
1984}
1985
1986// Match a base and an immediate (if Offset is not null) or an SGPR (if
1987// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
1988// true, match only 32-bit immediate offsets available on CI.
1989bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
1990 SDValue *SOffset, SDValue *Offset,
1991 bool Imm32Only,
1992 bool IsBuffer) const {
1993 if (SOffset && Offset) {
1994 assert(!Imm32Only && !IsBuffer);
1995 SDValue B;
1996 return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
1997 SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
1998 }
1999
2000 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2001 // wraparound, because s_load instructions perform the addition in 64 bits.
2002 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2003 !Addr->getFlags().hasNoUnsignedWrap())
2004 return false;
2005
2006 SDValue N0, N1;
2007 // Extract the base and offset if possible.
2008 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2009 N0 = Addr.getOperand(0);
2010 N1 = Addr.getOperand(1);
2011 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2012 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2013 }
2014 if (!N0 || !N1)
2015 return false;
2016 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2017 SBase = N0;
2018 return true;
2019 }
2020 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2021 SBase = N1;
2022 return true;
2023 }
2024 return false;
2025}
2026
2027bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2028 SDValue *SOffset, SDValue *Offset,
2029 bool Imm32Only) const {
2030 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2031 SBase = Expand32BitAddress(SBase);
2032 return true;
2033 }
2034
2035 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2036 SBase = Expand32BitAddress(Addr);
2037 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2038 return true;
2039 }
2040
2041 return false;
2042}
2043
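// Match an SMRD load: SGPR base plus an immediate offset that the
// instruction can encode.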
2044bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2045 SDValue &Offset) const {
2046 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2047}
2048
2049bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2050 SDValue &Offset) const {
2051 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2052 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2053 /* Imm32Only */ true);
2054}
2055
2056bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2057 SDValue &SOffset) const {
2058 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2059}
2060
2061bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2062 SDValue &SOffset,
2063 SDValue &Offset) const {
2064 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2065}
2066
2067bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2068 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2069 /* Imm32Only */ false, /* IsBuffer */ true);
2070}
2071
2072bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2073 SDValue &Offset) const {
2074 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2075 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2076 /* Imm32Only */ true, /* IsBuffer */ true);
2077}
2078
2079bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2080 SDValue &Offset) const {
2081 // Match the (soffset + offset) pair as a 32-bit register base and
2082 // an immediate offset.
2083 return N.getValueType() == MVT::i32 &&
2084 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2085 &Offset, /* Imm32Only */ false,
2086 /* IsBuffer */ true);
2087}
2088
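// Split an index into a base register and a constant offset for MOVREL
// (indirect register indexing).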
2089bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2090 SDValue &Base,
2091 SDValue &Offset) const {
2092 SDLoc DL(Index);
2093
2094 if (CurDAG->isBaseWithConstantOffset(Index)) {
2095 SDValue N0 = Index.getOperand(0);
2096 SDValue N1 = Index.getOperand(1);
2097 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2098
2099 // (add n0, c0)
2100 // Don't peel off the offset (c0) if doing so could possibly lead
2101 // the base (n0) to be negative.
2102 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2103 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2104 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2105 Base = N0;
2106 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2107 return true;
2108 }
2109 }
2110
2111 if (isa<ConstantSDNode>(Index))
2112 return false;
2113
2114 Base = Index;
2115 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2116 return true;
2117}
2118
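// Emit a 32-bit bit-field extract: V_BFE for divergent inputs, otherwise
// S_BFE with the offset and width packed into a single immediate.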
2119SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2120 SDValue Val, uint32_t Offset,
2121 uint32_t Width) {
2122 if (Val->isDivergent()) {
2123 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2124 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2125 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2126
2127 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2128 }
2129 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2130 // Pack the offset and width of the BFE into the format expected by
2131 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
2132 // contain the offset and bits [22:16] the width.
2133 uint32_t PackedVal = Offset | (Width << 16);
2134 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2135
2136 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2137}
2138
2139void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2140 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2141 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2142 // Predicate: 0 < b <= c < 32
2143
2144 const SDValue &Shl = N->getOperand(0);
2145 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2146 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2147
2148 if (B && C) {
2149 uint32_t BVal = B->getZExtValue();
2150 uint32_t CVal = C->getZExtValue();
2151
2152 if (0 < BVal && BVal <= CVal && CVal < 32) {
2153 bool Signed = N->getOpcode() == ISD::SRA;
2154 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2155 32 - CVal));
2156 return;
2157 }
2158 }
2159 SelectCode(N);
2160}
2161
2162void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2163 switch (N->getOpcode()) {
2164 case ISD::AND:
2165 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2166 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2167 // Predicate: isMask(mask)
2168 const SDValue &Srl = N->getOperand(0);
2169 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2170 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2171
2172 if (Shift && Mask) {
2173 uint32_t ShiftVal = Shift->getZExtValue();
2174 uint32_t MaskVal = Mask->getZExtValue();
2175
2176 if (isMask_32(MaskVal)) {
2177 uint32_t WidthVal = llvm::popcount(MaskVal);
2178 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2179 WidthVal));
2180 return;
2181 }
2182 }
2183 }
2184 break;
2185 case ISD::SRL:
2186 if (N->getOperand(0).getOpcode() == ISD::AND) {
2187 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2188 // Predicate: isMask(mask >> b)
2189 const SDValue &And = N->getOperand(0);
2190 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2191 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2192
2193 if (Shift && Mask) {
2194 uint32_t ShiftVal = Shift->getZExtValue();
2195 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2196
2197 if (isMask_32(MaskVal)) {
2198 uint32_t WidthVal = llvm::popcount(MaskVal);
2199 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2200 WidthVal));
2201 return;
2202 }
2203 }
2204 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2205 SelectS_BFEFromShifts(N);
2206 return;
2207 }
2208 break;
2209 case ISD::SRA:
2210 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2211 SelectS_BFEFromShifts(N);
2212 return;
2213 }
2214 break;
2215
2216 case ISD::SIGN_EXTEND_INREG: {
2217 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2218 SDValue Src = N->getOperand(0);
2219 if (Src.getOpcode() != ISD::SRL)
2220 break;
2221
2222 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2223 if (!Amt)
2224 break;
2225
2226 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2227 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2228 Amt->getZExtValue(), Width));
2229 return;
2230 }
2231 }
2232
2233 SelectCode(N);
2234}
2235
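// A branch condition can use SCC only if it is the sole user of a setcc the
// SALU can evaluate: any 32-bit compare, or a 64-bit eq/ne compare when
// scalar 64-bit compares are available.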
2236bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2237 assert(N->getOpcode() == ISD::BRCOND);
2238 if (!N->hasOneUse())
2239 return false;
2240
2241 SDValue Cond = N->getOperand(1);
2242 if (Cond.getOpcode() == ISD::CopyToReg)
2243 Cond = Cond.getOperand(2);
2244
2245 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2246 return false;
2247
2248 MVT VT = Cond.getOperand(0).getSimpleValueType();
2249 if (VT == MVT::i32)
2250 return true;
2251
2252 if (VT == MVT::i64) {
2253 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2254
2255 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2256 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2257 }
2258
2259 return false;
2260}
2261
2262void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2263 SDValue Cond = N->getOperand(1);
2264
2265 if (Cond.isUndef()) {
2266 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2267 N->getOperand(2), N->getOperand(0));
2268 return;
2269 }
2270
2271 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2272 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2273
2274 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2275 unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2276 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2277 SDLoc SL(N);
2278
2279 if (!UseSCCBr) {
2280 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2281 // analyzed what generates the vcc value, so we do not know whether vcc
2282 // bits for disabled lanes are 0. Thus we need to mask out bits for
2283 // disabled lanes.
2284 //
2285 // In the case that we select S_CBRANCH_SCC1 and it gets
2286 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2287 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2288 //
2289 // We could add an analysis of what generates the vcc value here and omit
2290 // the S_AND when it is unnecessary. But it would be better to add a separate
2291 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2292 // catches both cases.
2293 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2294 : AMDGPU::S_AND_B64,
2295 SL, MVT::i1,
2296 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2297 : AMDGPU::EXEC,
2298 MVT::i1),
2299 Cond),
2300 0);
2301 }
2302
2303 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2304 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2305 N->getOperand(2), // Basic Block
2306 VCC.getValue(0));
2307}
2308
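// On subtargets with SALU float instructions, a uniform f32 fpext of the
// high f16 half of a register selects directly to S_CVT_HI_F32_F16.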
2309void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2310 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2311 !N->isDivergent()) {
2312 SDValue Src = N->getOperand(0);
2313 if (Src.getValueType() == MVT::f16) {
2314 if (isExtractHiElt(Src, Src)) {
2315 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2316 {Src});
2317 return;
2318 }
2319 }
2320 }
2321
2322 SelectCode(N);
2323}
2324
2325void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2326 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2327 // be copied to an SGPR with readfirstlane.
2328 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2329 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2330
2331 SDValue Chain = N->getOperand(0);
2332 SDValue Ptr = N->getOperand(2);
2333 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2334 MachineMemOperand *MMO = M->getMemOperand();
2335 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2336
2337 SDValue Offset;
2338 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2339 SDValue PtrBase = Ptr.getOperand(0);
2340 SDValue PtrOffset = Ptr.getOperand(1);
2341
2342 const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2343 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2344 N = glueCopyToM0(N, PtrBase);
2345 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2346 }
2347 }
2348
2349 if (!Offset) {
2350 N = glueCopyToM0(N, Ptr);
2351 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2352 }
2353
2354 SDValue Ops[] = {
2355 Offset,
2356 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2357 Chain,
2358 N->getOperand(N->getNumOperands() - 1) // New glue
2359 };
2360
2361 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2362 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2363}
2364
2365// We need to handle this here because tablegen doesn't support matching
2366// instructions with multiple outputs.
2367void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2368 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2369 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2370 N->getOperand(5), N->getOperand(0)};
2371
2372 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2373 MachineMemOperand *MMO = M->getMemOperand();
2374 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2375 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2376}
2377
2378static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2379 switch (IntrID) {
2380 case Intrinsic::amdgcn_ds_gws_init:
2381 return AMDGPU::DS_GWS_INIT;
2382 case Intrinsic::amdgcn_ds_gws_barrier:
2383 return AMDGPU::DS_GWS_BARRIER;
2384 case Intrinsic::amdgcn_ds_gws_sema_v:
2385 return AMDGPU::DS_GWS_SEMA_V;
2386 case Intrinsic::amdgcn_ds_gws_sema_br:
2387 return AMDGPU::DS_GWS_SEMA_BR;
2388 case Intrinsic::amdgcn_ds_gws_sema_p:
2389 return AMDGPU::DS_GWS_SEMA_P;
2390 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2391 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2392 default:
2393 llvm_unreachable("not a gws intrinsic");
2394 }
2395}
2396
2397void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2398 if (!Subtarget->hasGWS() ||
2399 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2400 !Subtarget->hasGWSSemaReleaseAll())) {
2401 // Let this error.
2402 SelectCode(N);
2403 return;
2404 }
2405
2406 // Chain, intrinsic ID, vsrc, offset
2407 const bool HasVSrc = N->getNumOperands() == 4;
2408 assert(HasVSrc || N->getNumOperands() == 3);
2409
2410 SDLoc SL(N);
2411 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2412 int ImmOffset = 0;
2413 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2414 MachineMemOperand *MMO = M->getMemOperand();
2415
2416 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2417 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2418
2419 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2420 // offset field) % 64. Some versions of the programming guide omit the m0
2421 // part, or claim it's from offset 0.
2422 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2423 // If we have a constant offset, try to use the 0 in m0 as the base.
2424 // TODO: Look into changing the default m0 initialization value. If the
2425 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2426 // the immediate offset.
2427 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2428 ImmOffset = ConstOffset->getZExtValue();
2429 } else {
2430 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2431 ImmOffset = BaseOffset.getConstantOperandVal(1);
2432 BaseOffset = BaseOffset.getOperand(0);
2433 }
2434
2435 // Prefer to do the shift in an SGPR since it should be possible to use m0
2436 // as the result directly. If it's already an SGPR, it will be eliminated
2437 // later.
2438 SDNode *SGPROffset
2439 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2440 BaseOffset);
2441 // Shift to offset in m0
2442 SDNode *M0Base
2443 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2444 SDValue(SGPROffset, 0),
2445 CurDAG->getTargetConstant(16, SL, MVT::i32));
2446 glueCopyToM0(N, SDValue(M0Base, 0));
2447 }
2448
2449 SDValue Chain = N->getOperand(0);
2450 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2451
2452 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2453 SmallVector<SDValue, 5> Ops;
2454 if (HasVSrc)
2455 Ops.push_back(N->getOperand(2));
2456 Ops.push_back(OffsetField);
2457 Ops.push_back(Chain);
2458
2459 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2460 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2461}
2462
2463void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2464 if (Subtarget->getLDSBankCount() != 16) {
2465 // This is a single instruction with a pattern.
2466 SelectCode(N);
2467 return;
2468 }
2469
2470 SDLoc DL(N);
2471
2472 // This requires 2 instructions. It is possible to write a pattern to support
2473 // this, but the generated isel emitter doesn't correctly deal with multiple
2474 // output instructions using the same physical register input. The copy to m0
2475 // is incorrectly placed before the second instruction.
2476 //
2477 // TODO: Match source modifiers.
2478 //
2479 // def : Pat <
2480 // (int_amdgcn_interp_p1_f16
2481 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2482 // (i32 timm:$attrchan), (i32 timm:$attr),
2483 // (i1 timm:$high), M0),
2484 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2485 // timm:$attrchan, 0,
2486 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2487 // let Predicates = [has16BankLDS];
2488 // }
2489
2490 // 16 bank LDS
2491 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2492 N->getOperand(5), SDValue());
2493
2494 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2495
2496 SDNode *InterpMov =
2497 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2498 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2499 N->getOperand(3), // Attr
2500 N->getOperand(2), // Attrchan
2501 ToM0.getValue(1) // In glue
2502 });
2503
2504 SDNode *InterpP1LV =
2505 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2506 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2507 N->getOperand(1), // Src0
2508 N->getOperand(3), // Attr
2509 N->getOperand(2), // Attrchan
2510 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2511 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2512 N->getOperand(4), // high
2513 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2514 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2515 SDValue(InterpMov, 1)
2516 });
2517
2518 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2519}
2520
2521void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2522 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2523 switch (IntrID) {
2524 case Intrinsic::amdgcn_ds_append:
2525 case Intrinsic::amdgcn_ds_consume: {
2526 if (N->getValueType(0) != MVT::i32)
2527 break;
2528 SelectDSAppendConsume(N, IntrID);
2529 return;
2530 }
2531 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2532 SelectDSBvhStackIntrinsic(N);
2533 return;
2534 }
2535
2536 SelectCode(N);
2537}
2538
2539void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2540 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2541 unsigned Opcode;
2542 switch (IntrID) {
2543 case Intrinsic::amdgcn_wqm:
2544 Opcode = AMDGPU::WQM;
2545 break;
2546 case Intrinsic::amdgcn_softwqm:
2547 Opcode = AMDGPU::SOFT_WQM;
2548 break;
2549 case Intrinsic::amdgcn_wwm:
2550 case Intrinsic::amdgcn_strict_wwm:
2551 Opcode = AMDGPU::STRICT_WWM;
2552 break;
2553 case Intrinsic::amdgcn_strict_wqm:
2554 Opcode = AMDGPU::STRICT_WQM;
2555 break;
2556 case Intrinsic::amdgcn_interp_p1_f16:
2557 SelectInterpP1F16(N);
2558 return;
2559 case Intrinsic::amdgcn_inverse_ballot:
2560 switch (N->getOperand(1).getValueSizeInBits()) {
2561 case 32:
2562 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2563 break;
2564 case 64:
2565 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2566 break;
2567 default:
2568 llvm_unreachable("Unsupported size for inverse ballot mask.");
2569 }
2570 break;
2571 default:
2572 SelectCode(N);
2573 return;
2574 }
2575
2576 SDValue Src = N->getOperand(1);
2577 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2578}
2579
2580void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2581 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2582 switch (IntrID) {
2583 case Intrinsic::amdgcn_ds_gws_init:
2584 case Intrinsic::amdgcn_ds_gws_barrier:
2585 case Intrinsic::amdgcn_ds_gws_sema_v:
2586 case Intrinsic::amdgcn_ds_gws_sema_br:
2587 case Intrinsic::amdgcn_ds_gws_sema_p:
2588 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2589 SelectDS_GWS(N, IntrID);
2590 return;
2591 default:
2592 break;
2593 }
2594
2595 SelectCode(N);
2596}
2597
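// WAVE_ADDRESS rescales a scratch offset by the wavefront size; select it as
// a scalar right shift by log2(wavesize).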
2598void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2599 SDValue Log2WaveSize =
2600 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2601 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2602 {N->getOperand(0), Log2WaveSize});
2603}
2604
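// STACKRESTORE needs the inverse scaling: shift the saved value left by
// log2(wavesize) before copying it into the stack pointer, unless it was
// produced by WAVE_ADDRESS, in which case the original value is reused.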
2605void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2606 SDValue SrcVal = N->getOperand(1);
2607 if (SrcVal.getValueType() != MVT::i32) {
2608 SelectCode(N); // Emit default error
2609 return;
2610 }
2611
2612 SDValue CopyVal;
2613 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2614 SDLoc SL(N);
2615
2616 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2617 CopyVal = SrcVal.getOperand(0);
2618 } else {
2619 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2620 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2621
2622 if (N->isDivergent()) {
2623 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2624 MVT::i32, SrcVal),
2625 0);
2626 }
2627
2628 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2629 {SrcVal, Log2WaveSize}),
2630 0);
2631 }
2632
2633 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2634 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2635}
2636
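// Peel fneg/fabs (and, when canonicalizing, fsub from +/-0) off the source
// value, accumulating the equivalent SISrcMods bits.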
2637bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2638 unsigned &Mods,
2639 bool IsCanonicalizing,
2640 bool AllowAbs) const {
2641 Mods = SISrcMods::NONE;
2642 Src = In;
2643
2644 if (Src.getOpcode() == ISD::FNEG) {
2645 Mods |= SISrcMods::NEG;
2646 Src = Src.getOperand(0);
2647 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2648 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2649 // denormal mode, but we're implicitly canonicalizing in a source operand.
2650 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2651 if (LHS && LHS->isZero()) {
2652 Mods |= SISrcMods::NEG;
2653 Src = Src.getOperand(1);
2654 }
2655 }
2656
2657 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2658 Mods |= SISrcMods::ABS;
2659 Src = Src.getOperand(0);
2660 }
2661
2662 return true;
2663}
2664
2665bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2666 SDValue &SrcMods) const {
2667 unsigned Mods;
2668 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2669 /*AllowAbs=*/true)) {
2670 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2671 return true;
2672 }
2673
2674 return false;
2675}
2676
2677bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2678 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2679 unsigned Mods;
2680 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2681 /*AllowAbs=*/true)) {
2682 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2683 return true;
2684 }
2685
2686 return false;
2687}
2688
2689bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2690 SDValue &SrcMods) const {
2691 unsigned Mods;
2692 if (SelectVOP3ModsImpl(In, Src, Mods,
2693 /*IsCanonicalizing=*/true,
2694 /*AllowAbs=*/false)) {
2695 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2696 return true;
2697 }
2698
2699 return false;
2700}
2701
2702bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2703 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2704 return false;
2705
2706 Src = In;
2707 return true;
2708}
2709
2710bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2711 SDValue &SrcMods,
2712 bool OpSel) const {
2713 unsigned Mods;
2714 if (SelectVOP3ModsImpl(In, Src, Mods,
2715 /*IsCanonicalizing=*/true,
2716 /*AllowAbs=*/false)) {
2717 if (OpSel)
2718 Mods |= SISrcMods::OP_SEL_0;
2719 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2720 return true;
2721 }
2722
2723 return false;
2724}
2725
2726bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2727 SDValue &SrcMods) const {
2728 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2729}
2730
2731bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2732 SDValue &SrcMods) const {
2733 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2734}
2735
2736bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2737 SDValue &SrcMods, SDValue &Clamp,
2738 SDValue &Omod) const {
2739 SDLoc DL(In);
2740 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2741 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2742
2743 return SelectVOP3Mods(In, Src, SrcMods);
2744}
2745
2746bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2747 SDValue &SrcMods, SDValue &Clamp,
2748 SDValue &Omod) const {
2749 SDLoc DL(In);
2750 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2751 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2752
2753 return SelectVOP3BMods(In, Src, SrcMods);
2754}
2755
2756bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2757 SDValue &Clamp, SDValue &Omod) const {
2758 Src = In;
2759
2760 SDLoc DL(In);
2761 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2762 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2763
2764 return true;
2765}
2766
2767bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2768 SDValue &SrcMods, bool IsDOT) const {
2769 unsigned Mods = SISrcMods::NONE;
2770 Src = In;
2771
2772 // TODO: Handle G_FSUB 0 as fneg
2773 if (Src.getOpcode() == ISD::FNEG) {
2774 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2775 Src = Src.getOperand(0);
2776 }
2777
2778 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2779 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2780 unsigned VecMods = Mods;
2781
2782 SDValue Lo = stripBitcast(Src.getOperand(0));
2783 SDValue Hi = stripBitcast(Src.getOperand(1));
2784
2785 if (Lo.getOpcode() == ISD::FNEG) {
2786 Lo = stripBitcast(Lo.getOperand(0));
2787 Mods ^= SISrcMods::NEG;
2788 }
2789
2790 if (Hi.getOpcode() == ISD::FNEG) {
2791 Hi = stripBitcast(Hi.getOperand(0));
2792 Mods ^= SISrcMods::NEG_HI;
2793 }
2794
2795 if (isExtractHiElt(Lo, Lo))
2796 Mods |= SISrcMods::OP_SEL_0;
2797
2798 if (isExtractHiElt(Hi, Hi))
2799 Mods |= SISrcMods::OP_SEL_1;
2800
2801 unsigned VecSize = Src.getValueSizeInBits();
2802 Lo = stripExtractLoElt(Lo);
2803 Hi = stripExtractLoElt(Hi);
2804
2805 if (Lo.getValueSizeInBits() > VecSize) {
2806 Lo = CurDAG->getTargetExtractSubreg(
2807 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2808 MVT::getIntegerVT(VecSize), Lo);
2809 }
2810
2811 if (Hi.getValueSizeInBits() > VecSize) {
2812 Hi = CurDAG->getTargetExtractSubreg(
2813 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2814 MVT::getIntegerVT(VecSize), Hi);
2815 }
2816
2817 assert(Lo.getValueSizeInBits() <= VecSize &&
2818 Hi.getValueSizeInBits() <= VecSize);
2819
2820 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2821 // Really a scalar input. Just select from the low half of the register to
2822 // avoid packing.
2823
2824 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2825 Src = Lo;
2826 } else {
2827 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2828
2829 SDLoc SL(In);
2830 SDValue Undef = SDValue(
2831 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2832 Lo.getValueType()), 0);
2833 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2834 : AMDGPU::SReg_64RegClassID;
2835 const SDValue Ops[] = {
2836 CurDAG->getTargetConstant(RC, SL, MVT::i32),
2837 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2838 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2839
2840 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2841 Src.getValueType(), Ops), 0);
2842 }
2843 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2844 return true;
2845 }
2846
2847 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2848 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2849 .bitcastToAPInt().getZExtValue();
2850 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2851 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
2852 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2853 return true;
2854 }
2855 }
2856
2857 Mods = VecMods;
2858 }
2859
2860 // Packed instructions do not have abs modifiers.
2861 Mods |= SISrcMods::OP_SEL_1;
2862
2863 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2864 return true;
2865}
2866
2867bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
2868 SDValue &SrcMods) const {
2869 return SelectVOP3PMods(In, Src, SrcMods, true);
2870}
2871
2872bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
2873 const ConstantSDNode *C = cast<ConstantSDNode>(In);
2874 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
2875 // 1 promotes packed values to signed, 0 treats them as unsigned.
2876 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
2877
2878 unsigned Mods = SISrcMods::OP_SEL_1;
2879 unsigned SrcSign = C->getZExtValue();
2880 if (SrcSign == 1)
2881 Mods ^= SISrcMods::NEG;
2882
2883 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2884 return true;
2885}
2886
2887bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
2888 SDValue &Src) const {
2889 const ConstantSDNode *C = cast<ConstantSDNode>(In);
2890 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
2891
2892 unsigned Mods = SISrcMods::OP_SEL_1;
2893 unsigned SrcVal = C->getZExtValue();
2894 if (SrcVal == 1)
2895 Mods |= SISrcMods::OP_SEL_0;
2896
2897 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2898 return true;
2899}
2900
2901bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2902 SDValue &SrcMods) const {
2903 Src = In;
2904 // FIXME: Handle op_sel
2905 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2906 return true;
2907}
2908
2909bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2910 SDValue &SrcMods) const {
2911 // FIXME: Handle op_sel
2912 return SelectVOP3Mods(In, Src, SrcMods);
2913}
2914
2915// The return value is not whether the match is possible (which it always is),
2916 // but whether or not a conversion is really used.
2917bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2918 unsigned &Mods) const {
2919 Mods = 0;
2920 SelectVOP3ModsImpl(In, Src, Mods);
2921
2922 if (Src.getOpcode() == ISD::FP_EXTEND) {
2923 Src = Src.getOperand(0);
2924 assert(Src.getValueType() == MVT::f16);
2925 Src = stripBitcast(Src);
2926
2927 // Be careful about folding modifiers if we already have an abs. fneg is
2928 // applied last, so we don't want to apply an earlier fneg.
2929 if ((Mods & SISrcMods::ABS) == 0) {
2930 unsigned ModsTmp;
2931 SelectVOP3ModsImpl(Src, Src, ModsTmp);
2932
2933 if ((ModsTmp & SISrcMods::NEG) != 0)
2934 Mods ^= SISrcMods::NEG;
2935
2936 if ((ModsTmp & SISrcMods::ABS) != 0)
2937 Mods |= SISrcMods::ABS;
2938 }
2939
2940 // op_sel/op_sel_hi decide the source type and source.
2941 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2942 // If the source's op_sel is set, it picks the high half of the source
2943 // register.
2944
2945 Mods |= SISrcMods::OP_SEL_1;
2946 if (isExtractHiElt(Src, Src)) {
2947 Mods |= SISrcMods::OP_SEL_0;
2948
2949 // TODO: Should we try to look for neg/abs here?
2950 }
2951
2952 return true;
2953 }
2954
2955 return false;
2956}
2957
2958bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
2959 SDValue &SrcMods) const {
2960 unsigned Mods = 0;
2961 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
2962 return false;
2963 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2964 return true;
2965}
2966
2967bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2968 SDValue &SrcMods) const {
2969 unsigned Mods = 0;
2970 SelectVOP3PMadMixModsImpl(In, Src, Mods);
2971 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2972 return true;
2973}
2974
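// Return a 32-bit value whose high 16 bits hold In: shift constants left by
// 16, or look through an explicit extract of the high half.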
2975SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2976 if (In.isUndef())
2977 return CurDAG->getUNDEF(MVT::i32);
2978
2979 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2980 SDLoc SL(In);
2981 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2982 }
2983
2984 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2985 SDLoc SL(In);
2986 return CurDAG->getConstant(
2987 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2988 }
2989
2990 SDValue Src;
2991 if (isExtractHiElt(In, Src))
2992 return Src;
2993
2994 return SDValue();
2995}
2996
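// Decide whether an immediate should be materialized in a VGPR: true if at
// least one use (even after trying to commute it) requires a VGPR operand.
// Only the first 10 uses are inspected to bound compile time.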
2997bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2998 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2999
3000 const SIRegisterInfo *SIRI =
3001 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3002 const SIInstrInfo * SII =
3003 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3004
3005 unsigned Limit = 0;
3006 bool AllUsesAcceptSReg = true;
3007 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3008 Limit < 10 && U != E; ++U, ++Limit) {
3009 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3010
3011 // If the register class is unknown, it could be an unknown
3012 // register class that needs to be an SGPR, e.g. an inline asm
3013 // constraint
3014 if (!RC || SIRI->isSGPRClass(RC))
3015 return false;
3016
3017 if (RC != &AMDGPU::VS_32RegClass) {
3018 AllUsesAcceptSReg = false;
3019 SDNode * User = *U;
3020 if (User->isMachineOpcode()) {
3021 unsigned Opc = User->getMachineOpcode();
3022 const MCInstrDesc &Desc = SII->get(Opc);
3023 if (Desc.isCommutable()) {
3024 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3025 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3026 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3027 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3028 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3029 if (CommutedRC == &AMDGPU::VS_32RegClass)
3030 AllUsesAcceptSReg = true;
3031 }
3032 }
3033 }
3034 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
3035 // commuting current user. This means have at least one use
3036 // that strictly require VGPR. Thus, we will not attempt to commute
3037 // other user instructions.
3038 if (!AllUsesAcceptSReg)
3039 break;
3040 }
3041 }
3042 return !AllUsesAcceptSReg && (Limit < 10);
3043}
3044
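// A load is uniform if its address is provably uniform, it is at least
// 4-byte aligned, and it reads constant address space (or an unclobbered
// simple global load when scalarizing global loads is enabled).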
3045bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
3046 auto Ld = cast<LoadSDNode>(N);
3047
3048 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
3049 return false;
3050
3051 return Ld->getAlign() >= Align(4) &&
3052 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3053 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3054 (Subtarget->getScalarizeGlobalBehavior() &&
3055 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3056 Ld->isSimple() &&
3057 static_cast<const SITargetLowering *>(getTargetLowering())
3058 ->isMemOpHasNoClobberedMemOperand(N)));
3059}
3060
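// Run AMDGPU-specific post-isel folds over all selected machine nodes,
// repeating until a fixed point is reached.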
3061 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3062 const AMDGPUTargetLowering& Lowering =
3063 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3064 bool IsModified = false;
3065 do {
3066 IsModified = false;
3067
3068 // Go over all selected nodes and try to fold them a bit more
3069 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3070 while (Position != CurDAG->allnodes_end()) {
3071 SDNode *Node = &*Position++;
3072 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3073 if (!MachineNode)
3074 continue;
3075
3076 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3077 if (ResNode != Node) {
3078 if (ResNode)
3079 ReplaceUses(Node, ResNode);
3080 IsModified = true;
3081 }
3082 }
3083 CurDAG->RemoveDeadNodes();
3084 } while (IsModified);
3085}
3086
3087char AMDGPUDAGToDAGISel::ID = 0;
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue matchZExtFromI32(SDValue Op)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
unsigned const TargetRegisterInfo * TRI
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
pre isel intrinsic Pre ISel Intrinsic Lowering
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPUDAGToDAGISel()=delete
bool matchLoadD16FromBuildVector(SDNode *N) const
static bool isUniformMMO(const MachineMemOperand *MMO)
unsigned getWavefrontSizeLog2() const
bool hasInv2PiInlineImm() const
static SDValue stripBitcast(SDValue Val)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1485
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1600
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
uint64_t getZExtValue() const
int64_t getSExtValue() const
This class represents an Operation in the Expression.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:314
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
int getLDSBankCount() const
Definition: GCNSubtarget.h:305
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:431
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:435
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:588
bool hasDLInsts() const
Definition: GCNSubtarget.h:711
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:235
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
Definition: GCNSubtarget.h:960
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:516
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:247
bool hasDOTOpSelHazard() const
bool d16PreservesUnusedBits() const
Definition: GCNSubtarget.h:642
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:630
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:850
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:652
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:494
Generation getGeneration() const
Definition: GCNSubtarget.h:286
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:666
bool hasAddr64() const
Definition: GCNSubtarget.h:346
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:674
bool hasSALUFloatInsts() const
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:594
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
const Triple & getTargetTriple() const
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
A description of a memory reference used in the backend.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isDivergent() const
const SDValue & getOperand(unsigned Num) const
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static bool isLegalMUBUFImmOffset(unsigned Imm)
Definition: SIInstrInfo.h:1198
static unsigned getMaxMUBUFImmOffset()
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
MachineFunction * MF
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDNode * SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT)
These are used for target selectors to mutate the specified node to have the specified return type,...
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
allnodes_const_iterator allnodes_begin() const
Definition: SelectionDAG.h:531
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
allnodes_const_iterator allnodes_end() const
Definition: SelectionDAG.h:532
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:725
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNodes()
This method deletes all unreachable nodes in the SelectionDAG.
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:771
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:674
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
static const unsigned CommuteAnyOperandIndex
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save and restore.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:355
Legacy analysis pass which computes a CycleInfo.
LLVM Value Representation.
Definition: Value.h:74
Iterator for intrusive lists based on ilist_node.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:398
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:392
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:395
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:394
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:390
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:391
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:396
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer)
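A sketch of the intended usage: fold ByteOffset into an SMRD immediate only when the subtarget can encode it; otherwise it has to be materialized in a register. Subtarget, ByteOffset, and DL are assumed to be in scope.
if (std::optional<int64_t> Enc = AMDGPU::getSMRDEncodedOffset(
        *Subtarget, ByteOffset, /*IsBuffer=*/false))
  Offset = CurDAG->getTargetConstant(*Enc, DL, MVT::i32);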
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:119
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to. It returns an output chain.
Definition: ISDOpcodes.h:1121
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:250
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:269
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction.
Definition: ISDOpcodes.h:1026
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
Definition: ISDOpcodes.h:199
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:787
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition: ISDOpcodes.h:900
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:934
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the specified vector, with the remaining elements of the vector undefined.
Definition: ISDOpcodes.h:620
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:925
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
Definition: ISDOpcodes.h:208
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:777
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:966
@ TargetFrameIndex
Definition: ISDOpcodes.h:166
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register (e.g. sign extending the low 8 bits of a 32-bit register to fill the top 24 bits with the 7th bit).
Definition: ISDOpcodes.h:795
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:885
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:279
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:866
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:783
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1065
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified, possibly variable, elements.
Definition: ISDOpcodes.h:515
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out, when and/or'd.
Definition: ISDOpcodes.h:1503
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
constexpr const char32_t SBase
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:440
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:349
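For example, counting the set bits of a 16-bit checkerboard pattern with llvm::popcount from llvm/ADT/bit.h:
int SetBits = llvm::popcount(0xF0F0u); // == 8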
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (32 bit version).
Definition: MathExtras.h:240
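For example:
bool A = llvm::isMask_32(0x0000FFFFu); // true: ones from bit 0 upward
bool B = llvm::isMask_32(0x00FF00FFu); // false: the ones are not contiguous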
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
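Together with Hi_32 above, this is how a 64-bit immediate is split into the halves used to build a pair of 32-bit moves:
uint64_t Imm = 0x123456789ABCDEF0ULL;
uint32_t Lo = llvm::Lo_32(Imm); // 0x9ABCDEF0
uint32_t Hi = llvm::Hi_32(Imm); // 0x12345678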
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG, ready for instruction scheduling.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
unsigned M0(unsigned Val)
Definition: VE.h:375
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Description of the encoding of one expression Op.
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:351
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:363
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:239
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:311
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:319
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:292
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:136
static KnownBits computeForAddSub(bool Add, bool NSW, const KnownBits &LHS, KnownBits RHS)
Compute the known bits that result from adding LHS and RHS when Add is true, or from subtracting RHS from LHS when Add is false.
Definition: KnownBits.cpp:57
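A sketch combining the two KnownBits helpers above: feed two fully-known operands through an add and read the constant result back out.
KnownBits L = KnownBits::makeConstant(APInt(32, 40));
KnownBits R = KnownBits::makeConstant(APInt(32, 2));
KnownBits Sum =
    KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false, L, R);
// Sum.isConstant() && Sum.getConstant() == 42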
static unsigned getSubRegFromChannel(unsigned Channel)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.