AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
67 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 bool IsSGPR = TRI.isSGPRClass(SrcRC);
165 unsigned AndOpc =
166 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168 .addImm(1)
169 .addReg(SrcReg);
170 if (IsSGPR)
171 And.setOperandDead(3); // Dead scc
172
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174 .addImm(0)
175 .addReg(MaskedReg);
176 }
177
178 if (!MRI->getRegClassOrNull(SrcReg))
179 MRI->setRegClass(SrcReg, SrcRC);
180 I.eraseFromParent();
181 return true;
182 }
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
186 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
187 return false;
188
189 return true;
190 }
191
192 for (const MachineOperand &MO : I.operands()) {
193 if (MO.getReg().isPhysical())
194 continue;
195
196 const TargetRegisterClass *RC =
197 TRI.getConstrainedRegClassForOperand(MO, *MRI);
198 if (!RC)
199 continue;
200 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
201 }
202 return true;
203}
204
205bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
206 const Register DefReg = I.getOperand(0).getReg();
207 const LLT DefTy = MRI->getType(DefReg);
208
209 // S1 G_PHIs should not be selected in instruction-select, instead:
210 // - divergent S1 G_PHI should go through lane mask merging algorithm
211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
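  // For example (illustrative sketch, register names are made up): a divergent
  //   %c:vcc(s1) = G_PHI %a(s1), %bb.1, %b(s1), %bb.2
  // is rewritten by the lane-mask merging lowering before this pass runs, and
  // a uniform s1 G_PHI has already been widened to s32, so any s1 G_PHI that
  // still reaches this point is rejected below.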
213 if (DefTy == LLT::scalar(1))
214 return false;
215
216 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
217
218 const RegClassOrRegBank &RegClassOrBank =
219 MRI->getRegClassOrRegBank(DefReg);
220
221 const TargetRegisterClass *DefRC
222 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
223 if (!DefRC) {
224 if (!DefTy.isValid()) {
225 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
226 return false;
227 }
228
229 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
230 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
231 if (!DefRC) {
232 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
233 return false;
234 }
235 }
236
237 // TODO: Verify that all registers have the same bank
238 I.setDesc(TII.get(TargetOpcode::PHI));
239 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
240}
241
242MachineOperand
243AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
244 const TargetRegisterClass &SubRC,
245 unsigned SubIdx) const {
246
247 MachineInstr *MI = MO.getParent();
248 MachineBasicBlock *BB = MI->getParent();
249 Register DstReg = MRI->createVirtualRegister(&SubRC);
250
251 if (MO.isReg()) {
252 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
253 Register Reg = MO.getReg();
254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
255 .addReg(Reg, 0, ComposedSubIdx);
256
257 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
258 MO.isKill(), MO.isDead(), MO.isUndef(),
259 MO.isEarlyClobber(), 0, MO.isDebug(),
260 MO.isInternalRead());
261 }
262
263 assert(MO.isImm());
264
265 APInt Imm(64, MO.getImm());
266
267 switch (SubIdx) {
268 default:
269 llvm_unreachable("do not know to split immediate with this sub index.");
270 case AMDGPU::sub0:
271 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
272 case AMDGPU::sub1:
273 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
274 }
275}
276
277static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
278 switch (Opc) {
279 case AMDGPU::G_AND:
280 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
281 case AMDGPU::G_OR:
282 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
283 case AMDGPU::G_XOR:
284 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
285 default:
286 llvm_unreachable("not a bit op");
287 }
288}
289
290bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
293
294 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
295 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296 DstRB->getID() != AMDGPU::VCCRegBankID)
297 return false;
298
299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
300 STI.isWave64());
301 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
302
303 // Dead implicit-def of scc
304 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
305 true, // isImp
306 false, // isKill
307 true)); // isDead
308 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
309}
310
311bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
312 MachineBasicBlock *BB = I.getParent();
313 MachineFunction *MF = BB->getParent();
314 Register DstReg = I.getOperand(0).getReg();
315 const DebugLoc &DL = I.getDebugLoc();
316 LLT Ty = MRI->getType(DstReg);
317 if (Ty.isVector())
318 return false;
319
320 unsigned Size = Ty.getSizeInBits();
321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
323 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
324
325 if (Size == 32) {
326 if (IsSALU) {
327 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
328 MachineInstr *Add =
329 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
330 .add(I.getOperand(1))
331 .add(I.getOperand(2))
332 .setOperandDead(3); // Dead scc
333 I.eraseFromParent();
334 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
335 }
336
337 if (STI.hasAddNoCarry()) {
338 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
339 I.setDesc(TII.get(Opc));
340 I.addOperand(*MF, MachineOperand::CreateImm(0));
341 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
342 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
343 }
344
345 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
346
347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
348 MachineInstr *Add
349 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
350 .addDef(UnusedCarry, RegState::Dead)
351 .add(I.getOperand(1))
352 .add(I.getOperand(2))
353 .addImm(0);
354 I.eraseFromParent();
355 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
356 }
357
358 assert(!Sub && "illegal sub should not reach here");
359
360 const TargetRegisterClass &RC
361 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
362 const TargetRegisterClass &HalfRC
363 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
364
365 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
366 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
367 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
368 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
369
370 Register DstLo = MRI->createVirtualRegister(&HalfRC);
371 Register DstHi = MRI->createVirtualRegister(&HalfRC);
372
373 if (IsSALU) {
374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
375 .add(Lo1)
376 .add(Lo2);
377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
378 .add(Hi1)
379 .add(Hi2)
380 .setOperandDead(3); // Dead scc
381 } else {
382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
383 Register CarryReg = MRI->createVirtualRegister(CarryRC);
384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
385 .addDef(CarryReg)
386 .add(Lo1)
387 .add(Lo2)
388 .addImm(0);
389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
391 .add(Hi1)
392 .add(Hi2)
393 .addReg(CarryReg, RegState::Kill)
394 .addImm(0);
395
396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
397 return false;
398 }
399
400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
401 .addReg(DstLo)
402 .addImm(AMDGPU::sub0)
403 .addReg(DstHi)
404 .addImm(AMDGPU::sub1);
405
406
407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
408 return false;
409
410 I.eraseFromParent();
411 return true;
412}
413
414bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
415 MachineInstr &I) const {
416 MachineBasicBlock *BB = I.getParent();
417 MachineFunction *MF = BB->getParent();
418 const DebugLoc &DL = I.getDebugLoc();
419 Register Dst0Reg = I.getOperand(0).getReg();
420 Register Dst1Reg = I.getOperand(1).getReg();
421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
422 I.getOpcode() == AMDGPU::G_UADDE;
423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
424 I.getOpcode() == AMDGPU::G_USUBE;
425
426 if (isVCC(Dst1Reg, *MRI)) {
427 unsigned NoCarryOpc =
428 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
429 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
430 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
431 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
432 I.addOperand(*MF, MachineOperand::CreateImm(0));
433 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
434 }
435
436 Register Src0Reg = I.getOperand(2).getReg();
437 Register Src1Reg = I.getOperand(3).getReg();
438
439 if (HasCarryIn) {
440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
441 .addReg(I.getOperand(4).getReg());
442 }
443
444 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
445 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
446
447 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
448 .add(I.getOperand(2))
449 .add(I.getOperand(3));
450
451 if (MRI->use_nodbg_empty(Dst1Reg)) {
452 CarryInst.setOperandDead(3); // Dead scc
453 } else {
454 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
455 .addReg(AMDGPU::SCC);
456 if (!MRI->getRegClassOrNull(Dst1Reg))
457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
458 }
459
460 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
462 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
463 return false;
464
465 if (HasCarryIn &&
466 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
467 AMDGPU::SReg_32RegClass, *MRI))
468 return false;
469
470 I.eraseFromParent();
471 return true;
472}
473
474bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
475 MachineInstr &I) const {
476 MachineBasicBlock *BB = I.getParent();
477 MachineFunction *MF = BB->getParent();
478 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
479
480 unsigned Opc;
481 if (Subtarget->hasMADIntraFwdBug())
482 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
483 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
484 else
485 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
486 I.setDesc(TII.get(Opc));
487 I.addOperand(*MF, MachineOperand::CreateImm(0));
488 I.addImplicitDefUseOperands(*MF);
489 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
490}
491
492// TODO: We should probably legalize these to only using 32-bit results.
493bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
494 MachineBasicBlock *BB = I.getParent();
495 Register DstReg = I.getOperand(0).getReg();
496 Register SrcReg = I.getOperand(1).getReg();
497 LLT DstTy = MRI->getType(DstReg);
498 LLT SrcTy = MRI->getType(SrcReg);
499 const unsigned SrcSize = SrcTy.getSizeInBits();
500 unsigned DstSize = DstTy.getSizeInBits();
501
502 // TODO: Should handle any multiple of 32 offset.
503 unsigned Offset = I.getOperand(2).getImm();
504 if (Offset % 32 != 0 || DstSize > 128)
505 return false;
506
507 // 16-bit operations really use 32-bit registers.
508 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
509 if (DstSize == 16)
510 DstSize = 32;
511
512 const TargetRegisterClass *DstRC =
513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
515 return false;
516
517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
518 const TargetRegisterClass *SrcRC =
519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
520 if (!SrcRC)
521 return false;
522 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
523 DstSize / 32);
524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
525 if (!SrcRC)
526 return false;
527
528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
529 *SrcRC, I.getOperand(1));
530 const DebugLoc &DL = I.getDebugLoc();
531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
532 .addReg(SrcReg, 0, SubReg);
533
534 I.eraseFromParent();
535 return true;
536}
537
538bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
539 MachineBasicBlock *BB = MI.getParent();
540 Register DstReg = MI.getOperand(0).getReg();
541 LLT DstTy = MRI->getType(DstReg);
542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
543
544 const unsigned SrcSize = SrcTy.getSizeInBits();
545 if (SrcSize < 32)
546 return selectImpl(MI, *CoverageInfo);
547
548 const DebugLoc &DL = MI.getDebugLoc();
549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
550 const unsigned DstSize = DstTy.getSizeInBits();
551 const TargetRegisterClass *DstRC =
552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
553 if (!DstRC)
554 return false;
555
556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
557 MachineInstrBuilder MIB =
558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
560 MachineOperand &Src = MI.getOperand(I + 1);
561 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
562 MIB.addImm(SubRegs[I]);
563
564 const TargetRegisterClass *SrcRC
565 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
567 return false;
568 }
569
570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
571 return false;
572
573 MI.eraseFromParent();
574 return true;
575}
576
577bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
578 MachineBasicBlock *BB = MI.getParent();
579 const int NumDst = MI.getNumOperands() - 1;
580
581 MachineOperand &Src = MI.getOperand(NumDst);
582
583 Register SrcReg = Src.getReg();
584 Register DstReg0 = MI.getOperand(0).getReg();
585 LLT DstTy = MRI->getType(DstReg0);
586 LLT SrcTy = MRI->getType(SrcReg);
587
588 const unsigned DstSize = DstTy.getSizeInBits();
589 const unsigned SrcSize = SrcTy.getSizeInBits();
590 const DebugLoc &DL = MI.getDebugLoc();
591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
592
593 const TargetRegisterClass *SrcRC =
594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
596 return false;
597
598 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
599 // source, and this relies on the fact that the same subregister indices are
600 // used for both.
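  // E.g. (sketch): unmerging an sgpr s64 source into one sgpr s32 and one
  // vgpr s32 destination still reads sub0 and sub1 of the same source; only
  // the destination register classes differ.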
601 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
602 for (int I = 0, E = NumDst; I != E; ++I) {
603 MachineOperand &Dst = MI.getOperand(I);
604 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
605 .addReg(SrcReg, 0, SubRegs[I]);
606
607 // Make sure the subregister index is valid for the source register.
608 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
609 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
610 return false;
611
612 const TargetRegisterClass *DstRC =
613 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
614 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
615 return false;
616 }
617
618 MI.eraseFromParent();
619 return true;
620}
621
622bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
623 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
624 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
625
626 Register Src0 = MI.getOperand(1).getReg();
627 Register Src1 = MI.getOperand(2).getReg();
628 LLT SrcTy = MRI->getType(Src0);
629 const unsigned SrcSize = SrcTy.getSizeInBits();
630
631 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
632 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
633 return selectG_MERGE_VALUES(MI);
634 }
635
636 // Selection logic below is for V2S16 only.
637 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
638 Register Dst = MI.getOperand(0).getReg();
639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
640 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
641 SrcTy != LLT::scalar(32)))
642 return selectImpl(MI, *CoverageInfo);
643
644 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
645 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
646 return false;
647
648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649 DstBank->getID() == AMDGPU::VGPRRegBankID);
650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
651
652 const DebugLoc &DL = MI.getDebugLoc();
653 MachineBasicBlock *BB = MI.getParent();
654
655 // First, before trying TableGen patterns, check if both sources are
656 // constants. In those cases, we can trivially compute the final constant
657 // and emit a simple move.
658 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
659 if (ConstSrc1) {
660 auto ConstSrc0 =
661 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
662 if (ConstSrc0) {
663 const int64_t K0 = ConstSrc0->Value.getSExtValue();
664 const int64_t K1 = ConstSrc1->Value.getSExtValue();
665 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
666 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
667 uint32_t Imm = Lo16 | (Hi16 << 16);
668
669 // VALU
670 if (IsVector) {
671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
672 MI.eraseFromParent();
673 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
674 }
675
676 // SALU
677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
678 MI.eraseFromParent();
679 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
680 }
681 }
682
683 // Now try TableGen patterns.
684 if (selectImpl(MI, *CoverageInfo))
685 return true;
686
687 // TODO: This should probably be a combine somewhere
688 // (build_vector $src0, undef) -> copy $src0
689 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
691 MI.setDesc(TII.get(AMDGPU::COPY));
692 MI.removeOperand(2);
693 const auto &RC =
694 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
695 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
696 RBI.constrainGenericRegister(Src0, RC, *MRI);
697 }
698
699 // TODO: Can be improved?
700 if (IsVector) {
701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
702 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
703 .addImm(0xFFFF)
704 .addReg(Src0);
705 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
706 return false;
707
708 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
709 .addReg(Src1)
710 .addImm(16)
711 .addReg(TmpReg);
712 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
713 return false;
714
715 MI.eraseFromParent();
716 return true;
717 }
718
719 Register ShiftSrc0;
720 Register ShiftSrc1;
721
722 // With multiple uses of the shift, this will duplicate the shift and
723 // increase register pressure.
724 //
725 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
726 // => (S_PACK_HH_B32_B16 $src0, $src1)
727 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
728 // => (S_PACK_HL_B32_B16 $src0, $src1)
729 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
730 // => (S_PACK_LH_B32_B16 $src0, $src1)
731 // (build_vector $src0, $src1)
732 // => (S_PACK_LL_B32_B16 $src0, $src1)
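  // Worked example (sketch; %a and %b stand for arbitrary SGPR s32 values):
  //   %h0:sgpr(s32) = G_LSHR %a, 16   ; single use
  //   %h1:sgpr(s32) = G_LSHR %b, 16   ; single use
  //   %v:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %h0, %h1
  // matches the first form above and is selected to S_PACK_HH_B32_B16 %a, %b,
  // so the one-use shifts are folded away rather than materialized.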
733
734 bool Shift0 = mi_match(
735 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
736
737 bool Shift1 = mi_match(
738 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
739
740 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
741 if (Shift0 && Shift1) {
742 Opc = AMDGPU::S_PACK_HH_B32_B16;
743 MI.getOperand(1).setReg(ShiftSrc0);
744 MI.getOperand(2).setReg(ShiftSrc1);
745 } else if (Shift1) {
746 Opc = AMDGPU::S_PACK_LH_B32_B16;
747 MI.getOperand(2).setReg(ShiftSrc1);
748 } else if (Shift0) {
749 auto ConstSrc1 =
750 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
751 if (ConstSrc1 && ConstSrc1->Value == 0) {
752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
753 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
754 .addReg(ShiftSrc0)
755 .addImm(16)
756 .setOperandDead(3); // Dead scc
757
758 MI.eraseFromParent();
759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
760 }
761 if (STI.hasSPackHL()) {
762 Opc = AMDGPU::S_PACK_HL_B32_B16;
763 MI.getOperand(1).setReg(ShiftSrc0);
764 }
765 }
766
767 MI.setDesc(TII.get(Opc));
768 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
769}
770
771bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
772 const MachineOperand &MO = I.getOperand(0);
773
774 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
775 // regbank check here is to know why getConstrainedRegClassForOperand failed.
776 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
777 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
778 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
779 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
780 return true;
781 }
782
783 return false;
784}
785
786bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
787 MachineBasicBlock *BB = I.getParent();
788
789 Register DstReg = I.getOperand(0).getReg();
790 Register Src0Reg = I.getOperand(1).getReg();
791 Register Src1Reg = I.getOperand(2).getReg();
792 LLT Src1Ty = MRI->getType(Src1Reg);
793
794 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
795 unsigned InsSize = Src1Ty.getSizeInBits();
796
797 int64_t Offset = I.getOperand(3).getImm();
798
799 // FIXME: These cases should have been illegal and unnecessary to check here.
800 if (Offset % 32 != 0 || InsSize % 32 != 0)
801 return false;
802
803 // Currently not handled by getSubRegFromChannel.
804 if (InsSize > 128)
805 return false;
806
807 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
808 if (SubReg == AMDGPU::NoSubRegister)
809 return false;
810
811 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
812 const TargetRegisterClass *DstRC =
813 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
814 if (!DstRC)
815 return false;
816
817 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
818 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
819 const TargetRegisterClass *Src0RC =
820 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
821 const TargetRegisterClass *Src1RC =
822 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
823
824 // Deal with weird cases where the class only partially supports the subreg
825 // index.
826 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
827 if (!Src0RC || !Src1RC)
828 return false;
829
830 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
831 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
832 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
833 return false;
834
835 const DebugLoc &DL = I.getDebugLoc();
836 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
837 .addReg(Src0Reg)
838 .addReg(Src1Reg)
839 .addImm(SubReg);
840
841 I.eraseFromParent();
842 return true;
843}
844
845bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
846 Register DstReg = MI.getOperand(0).getReg();
847 Register SrcReg = MI.getOperand(1).getReg();
848 Register OffsetReg = MI.getOperand(2).getReg();
849 Register WidthReg = MI.getOperand(3).getReg();
850
851 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
852 "scalar BFX instructions are expanded in regbankselect");
853 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
854 "64-bit vector BFX instructions are expanded in regbankselect");
855
856 const DebugLoc &DL = MI.getDebugLoc();
857 MachineBasicBlock *MBB = MI.getParent();
858
859 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
860 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
861 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
862 .addReg(SrcReg)
863 .addReg(OffsetReg)
864 .addReg(WidthReg);
865 MI.eraseFromParent();
866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867}
868
869bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
870 if (STI.getLDSBankCount() != 16)
871 return selectImpl(MI, *CoverageInfo);
872
873 Register Dst = MI.getOperand(0).getReg();
874 Register Src0 = MI.getOperand(2).getReg();
875 Register M0Val = MI.getOperand(6).getReg();
876 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
877 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
878 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
879 return false;
880
881 // This requires 2 instructions. It is possible to write a pattern to support
882 // this, but the generated isel emitter doesn't correctly deal with multiple
883 // output instructions using the same physical register input. The copy to m0
884 // is incorrectly placed before the second instruction.
885 //
886 // TODO: Match source modifiers.
887
888 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
889 const DebugLoc &DL = MI.getDebugLoc();
890 MachineBasicBlock *MBB = MI.getParent();
891
892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
893 .addReg(M0Val);
894 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
895 .addImm(2)
896 .addImm(MI.getOperand(4).getImm()) // $attr
897 .addImm(MI.getOperand(3).getImm()); // $attrchan
898
899 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
900 .addImm(0) // $src0_modifiers
901 .addReg(Src0) // $src0
902 .addImm(MI.getOperand(4).getImm()) // $attr
903 .addImm(MI.getOperand(3).getImm()) // $attrchan
904 .addImm(0) // $src2_modifiers
905 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
906 .addImm(MI.getOperand(5).getImm()) // $high
907 .addImm(0) // $clamp
908 .addImm(0); // $omod
909
910 MI.eraseFromParent();
911 return true;
912}
913
914// Writelane is special in that it can use SGPR and M0 (which would normally
915// count as using the constant bus twice - but in this case it is allowed since
916// the lane selector doesn't count as a use of the constant bus). However, it is
917// still required to abide by the 1 SGPR rule. Fix this up if we might have
918// multiple SGPRs.
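// Sketch of the effect: with a constant-bus limit of 1, a writelane whose
// value and lane select are both SGPRs would need two SGPR reads in one VALU
// instruction; the selection below keeps the value operand and routes the
// lane select through m0 (or folds an inline-immediate operand instead) so at
// most one non-m0 SGPR remains.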
919bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
920 // With a constant bus limit of at least 2, there's no issue.
921 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
922 return selectImpl(MI, *CoverageInfo);
923
924 MachineBasicBlock *MBB = MI.getParent();
925 const DebugLoc &DL = MI.getDebugLoc();
926 Register VDst = MI.getOperand(0).getReg();
927 Register Val = MI.getOperand(2).getReg();
928 Register LaneSelect = MI.getOperand(3).getReg();
929 Register VDstIn = MI.getOperand(4).getReg();
930
931 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
932
933 std::optional<ValueAndVReg> ConstSelect =
934 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
935 if (ConstSelect) {
936 // The selector has to be an inline immediate, so we can use whatever for
937 // the other operands.
938 MIB.addReg(Val);
939 MIB.addImm(ConstSelect->Value.getSExtValue() &
940 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
941 } else {
942 std::optional<ValueAndVReg> ConstVal =
943 getIConstantVRegValWithLookThrough(Val, *MRI);
944
945 // If the value written is an inline immediate, we can get away without a
946 // copy to m0.
947 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
948 STI.hasInv2PiInlineImm())) {
949 MIB.addImm(ConstVal->Value.getSExtValue());
950 MIB.addReg(LaneSelect);
951 } else {
952 MIB.addReg(Val);
953
954 // If the lane selector was originally in a VGPR and copied with
955 // readfirstlane, there's a hazard to read the same SGPR from the
956 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
957 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
958
959 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
960 .addReg(LaneSelect);
961 MIB.addReg(AMDGPU::M0);
962 }
963 }
964
965 MIB.addReg(VDstIn);
966
967 MI.eraseFromParent();
968 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
969}
970
971// We need to handle this here because tablegen doesn't support matching
972// instructions with multiple outputs.
973bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
974 Register Dst0 = MI.getOperand(0).getReg();
975 Register Dst1 = MI.getOperand(1).getReg();
976
977 LLT Ty = MRI->getType(Dst0);
978 unsigned Opc;
979 if (Ty == LLT::scalar(32))
980 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
981 else if (Ty == LLT::scalar(64))
982 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
983 else
984 return false;
985
986 // TODO: Match source modifiers.
987
988 const DebugLoc &DL = MI.getDebugLoc();
989 MachineBasicBlock *MBB = MI.getParent();
990
991 Register Numer = MI.getOperand(3).getReg();
992 Register Denom = MI.getOperand(4).getReg();
993 unsigned ChooseDenom = MI.getOperand(5).getImm();
994
995 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
996
997 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
998 .addDef(Dst1)
999 .addImm(0) // $src0_modifiers
1000 .addUse(Src0) // $src0
1001 .addImm(0) // $src1_modifiers
1002 .addUse(Denom) // $src1
1003 .addImm(0) // $src2_modifiers
1004 .addUse(Numer) // $src2
1005 .addImm(0) // $clamp
1006 .addImm(0); // $omod
1007
1008 MI.eraseFromParent();
1009 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1010}
1011
1012bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1013 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1014 switch (IntrinsicID) {
1015 case Intrinsic::amdgcn_if_break: {
1016 MachineBasicBlock *BB = I.getParent();
1017
1018 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1019 // SelectionDAG uses for wave32 vs wave64.
1020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1021 .add(I.getOperand(0))
1022 .add(I.getOperand(2))
1023 .add(I.getOperand(3));
1024
1025 Register DstReg = I.getOperand(0).getReg();
1026 Register Src0Reg = I.getOperand(2).getReg();
1027 Register Src1Reg = I.getOperand(3).getReg();
1028
1029 I.eraseFromParent();
1030
1031 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1032 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1033
1034 return true;
1035 }
1036 case Intrinsic::amdgcn_interp_p1_f16:
1037 return selectInterpP1F16(I);
1038 case Intrinsic::amdgcn_wqm:
1039 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1040 case Intrinsic::amdgcn_softwqm:
1041 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1042 case Intrinsic::amdgcn_strict_wwm:
1043 case Intrinsic::amdgcn_wwm:
1044 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1045 case Intrinsic::amdgcn_strict_wqm:
1046 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1047 case Intrinsic::amdgcn_writelane:
1048 return selectWritelane(I);
1049 case Intrinsic::amdgcn_div_scale:
1050 return selectDivScale(I);
1051 case Intrinsic::amdgcn_icmp:
1052 case Intrinsic::amdgcn_fcmp:
1053 if (selectImpl(I, *CoverageInfo))
1054 return true;
1055 return selectIntrinsicCmp(I);
1056 case Intrinsic::amdgcn_ballot:
1057 return selectBallot(I);
1058 case Intrinsic::amdgcn_reloc_constant:
1059 return selectRelocConstant(I);
1060 case Intrinsic::amdgcn_groupstaticsize:
1061 return selectGroupStaticSize(I);
1062 case Intrinsic::returnaddress:
1063 return selectReturnAddress(I);
1064 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1065 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1066 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1067 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1068 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1069 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1070 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1071 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1072 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1073 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1074 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1075 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1076 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1077 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1078 return selectSMFMACIntrin(I);
1079 default:
1080 return selectImpl(I, *CoverageInfo);
1081 }
1082}
1083
1084static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1085 const GCNSubtarget &ST) {
1086 if (Size != 16 && Size != 32 && Size != 64)
1087 return -1;
1088
1089 if (Size == 16 && !ST.has16BitInsts())
1090 return -1;
1091
1092 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1093 unsigned S64Opc) {
1094 if (Size == 16)
1095 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1096 if (Size == 32)
1097 return S32Opc;
1098 return S64Opc;
1099 };
1100
1101 switch (P) {
1102 default:
1103 llvm_unreachable("Unknown condition code!");
1104 case CmpInst::ICMP_NE:
1105 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1106 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1107 case CmpInst::ICMP_EQ:
1108 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1109 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1110 case CmpInst::ICMP_SGT:
1111 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1112 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1113 case CmpInst::ICMP_SGE:
1114 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1115 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1116 case CmpInst::ICMP_SLT:
1117 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1118 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1119 case CmpInst::ICMP_SLE:
1120 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1121 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1122 case CmpInst::ICMP_UGT:
1123 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1124 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1125 case CmpInst::ICMP_UGE:
1126 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1127 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1128 case CmpInst::ICMP_ULT:
1129 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1130 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1131 case CmpInst::ICMP_ULE:
1132 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1133 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1134
1135 case CmpInst::FCMP_OEQ:
1136 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1137 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1138 case CmpInst::FCMP_OGT:
1139 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1140 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1141 case CmpInst::FCMP_OGE:
1142 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1143 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1144 case CmpInst::FCMP_OLT:
1145 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1146 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1147 case CmpInst::FCMP_OLE:
1148 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1149 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1150 case CmpInst::FCMP_ONE:
1151 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1152 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1153 case CmpInst::FCMP_ORD:
1154 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1155 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1156 case CmpInst::FCMP_UNO:
1157 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1158 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1159 case CmpInst::FCMP_UEQ:
1160 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1161 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1162 case CmpInst::FCMP_UGT:
1163 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1164 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1165 case CmpInst::FCMP_UGE:
1166 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1167 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1168 case CmpInst::FCMP_ULT:
1169 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1170 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1171 case CmpInst::FCMP_ULE:
1172 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1173 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1174 case CmpInst::FCMP_UNE:
1175 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1176 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1177 case CmpInst::FCMP_TRUE:
1178 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1179 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1180 case CmpInst::FCMP_FALSE:
1181 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1182 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1183 }
1184}
1185
1186int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1187 unsigned Size) const {
1188 if (Size == 64) {
1189 if (!STI.hasScalarCompareEq64())
1190 return -1;
1191
1192 switch (P) {
1193 case CmpInst::ICMP_NE:
1194 return AMDGPU::S_CMP_LG_U64;
1195 case CmpInst::ICMP_EQ:
1196 return AMDGPU::S_CMP_EQ_U64;
1197 default:
1198 return -1;
1199 }
1200 }
1201
1202 if (Size == 32) {
1203 switch (P) {
1204 case CmpInst::ICMP_NE:
1205 return AMDGPU::S_CMP_LG_U32;
1206 case CmpInst::ICMP_EQ:
1207 return AMDGPU::S_CMP_EQ_U32;
1208 case CmpInst::ICMP_SGT:
1209 return AMDGPU::S_CMP_GT_I32;
1210 case CmpInst::ICMP_SGE:
1211 return AMDGPU::S_CMP_GE_I32;
1212 case CmpInst::ICMP_SLT:
1213 return AMDGPU::S_CMP_LT_I32;
1214 case CmpInst::ICMP_SLE:
1215 return AMDGPU::S_CMP_LE_I32;
1216 case CmpInst::ICMP_UGT:
1217 return AMDGPU::S_CMP_GT_U32;
1218 case CmpInst::ICMP_UGE:
1219 return AMDGPU::S_CMP_GE_U32;
1220 case CmpInst::ICMP_ULT:
1221 return AMDGPU::S_CMP_LT_U32;
1222 case CmpInst::ICMP_ULE:
1223 return AMDGPU::S_CMP_LE_U32;
1224 case CmpInst::FCMP_OEQ:
1225 return AMDGPU::S_CMP_EQ_F32;
1226 case CmpInst::FCMP_OGT:
1227 return AMDGPU::S_CMP_GT_F32;
1228 case CmpInst::FCMP_OGE:
1229 return AMDGPU::S_CMP_GE_F32;
1230 case CmpInst::FCMP_OLT:
1231 return AMDGPU::S_CMP_LT_F32;
1232 case CmpInst::FCMP_OLE:
1233 return AMDGPU::S_CMP_LE_F32;
1234 case CmpInst::FCMP_ONE:
1235 return AMDGPU::S_CMP_LG_F32;
1236 case CmpInst::FCMP_ORD:
1237 return AMDGPU::S_CMP_O_F32;
1238 case CmpInst::FCMP_UNO:
1239 return AMDGPU::S_CMP_U_F32;
1240 case CmpInst::FCMP_UEQ:
1241 return AMDGPU::S_CMP_NLG_F32;
1242 case CmpInst::FCMP_UGT:
1243 return AMDGPU::S_CMP_NLE_F32;
1244 case CmpInst::FCMP_UGE:
1245 return AMDGPU::S_CMP_NLT_F32;
1246 case CmpInst::FCMP_ULT:
1247 return AMDGPU::S_CMP_NGE_F32;
1248 case CmpInst::FCMP_ULE:
1249 return AMDGPU::S_CMP_NGT_F32;
1250 case CmpInst::FCMP_UNE:
1251 return AMDGPU::S_CMP_NEQ_F32;
1252 default:
1253 llvm_unreachable("Unknown condition code!");
1254 }
1255 }
1256
1257 if (Size == 16) {
1258 if (!STI.hasSALUFloatInsts())
1259 return -1;
1260
1261 switch (P) {
1262 case CmpInst::FCMP_OEQ:
1263 return AMDGPU::S_CMP_EQ_F16;
1264 case CmpInst::FCMP_OGT:
1265 return AMDGPU::S_CMP_GT_F16;
1266 case CmpInst::FCMP_OGE:
1267 return AMDGPU::S_CMP_GE_F16;
1268 case CmpInst::FCMP_OLT:
1269 return AMDGPU::S_CMP_LT_F16;
1270 case CmpInst::FCMP_OLE:
1271 return AMDGPU::S_CMP_LE_F16;
1272 case CmpInst::FCMP_ONE:
1273 return AMDGPU::S_CMP_LG_F16;
1274 case CmpInst::FCMP_ORD:
1275 return AMDGPU::S_CMP_O_F16;
1276 case CmpInst::FCMP_UNO:
1277 return AMDGPU::S_CMP_U_F16;
1278 case CmpInst::FCMP_UEQ:
1279 return AMDGPU::S_CMP_NLG_F16;
1280 case CmpInst::FCMP_UGT:
1281 return AMDGPU::S_CMP_NLE_F16;
1282 case CmpInst::FCMP_UGE:
1283 return AMDGPU::S_CMP_NLT_F16;
1284 case CmpInst::FCMP_ULT:
1285 return AMDGPU::S_CMP_NGE_F16;
1286 case CmpInst::FCMP_ULE:
1287 return AMDGPU::S_CMP_NGT_F16;
1288 case CmpInst::FCMP_UNE:
1289 return AMDGPU::S_CMP_NEQ_F16;
1290 default:
1291 llvm_unreachable("Unknown condition code!");
1292 }
1293 }
1294
1295 return -1;
1296}
1297
1298bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1299
1300 MachineBasicBlock *BB = I.getParent();
1301 const DebugLoc &DL = I.getDebugLoc();
1302
1303 Register SrcReg = I.getOperand(2).getReg();
1304 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1305
1306 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1307
1308 Register CCReg = I.getOperand(0).getReg();
1309 if (!isVCC(CCReg, *MRI)) {
1310 int Opcode = getS_CMPOpcode(Pred, Size);
1311 if (Opcode == -1)
1312 return false;
1313 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1314 .add(I.getOperand(2))
1315 .add(I.getOperand(3));
1316 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1317 .addReg(AMDGPU::SCC);
1318 bool Ret =
1319 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1320 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1321 I.eraseFromParent();
1322 return Ret;
1323 }
1324
1325 if (I.getOpcode() == AMDGPU::G_FCMP)
1326 return false;
1327
1328 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1329 if (Opcode == -1)
1330 return false;
1331
1332 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1333 I.getOperand(0).getReg())
1334 .add(I.getOperand(2))
1335 .add(I.getOperand(3));
1336 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1337 *TRI.getBoolRC(), *MRI);
1338 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1339 I.eraseFromParent();
1340 return Ret;
1341}
1342
1343bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1344 Register Dst = I.getOperand(0).getReg();
1345 if (isVCC(Dst, *MRI))
1346 return false;
1347
1348 LLT DstTy = MRI->getType(Dst);
1349 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1350 return false;
1351
1352 MachineBasicBlock *BB = I.getParent();
1353 const DebugLoc &DL = I.getDebugLoc();
1354 Register SrcReg = I.getOperand(2).getReg();
1355 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1356
1357 // i1 inputs are not supported in GlobalISel.
1358 if (Size == 1)
1359 return false;
1360
1361 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1362 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1363 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1364 I.eraseFromParent();
1365 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1366 }
1367
1368 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1369 if (Opcode == -1)
1370 return false;
1371
1372 MachineInstrBuilder SelectedMI;
1373 MachineOperand &LHS = I.getOperand(2);
1374 MachineOperand &RHS = I.getOperand(3);
1375 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1376 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1377 Register Src0Reg =
1378 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1379 Register Src1Reg =
1380 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1381 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1382 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1383 SelectedMI.addImm(Src0Mods);
1384 SelectedMI.addReg(Src0Reg);
1385 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1386 SelectedMI.addImm(Src1Mods);
1387 SelectedMI.addReg(Src1Reg);
1388 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1389 SelectedMI.addImm(0); // clamp
1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1391 SelectedMI.addImm(0); // op_sel
1392
1393 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1394 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1395 return false;
1396
1397 I.eraseFromParent();
1398 return true;
1399}
1400
1401bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1402 MachineBasicBlock *BB = I.getParent();
1403 const DebugLoc &DL = I.getDebugLoc();
1404 Register DstReg = I.getOperand(0).getReg();
1405 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1406 const bool Is64 = Size == 64;
1407 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1408
1409 // In the common case, the return type matches the wave size.
1410 // However we also support emitting i64 ballots in wave32 mode.
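  // E.g. (sketch): an i64 ballot in wave32 of a non-constant condition copies
  // the 32-bit mask into sub0 and a zero into sub1 of the 64-bit result via
  // REG_SEQUENCE, as done in BuildCopy below.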
1411 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1412 return false;
1413
1414 std::optional<ValueAndVReg> Arg =
1415 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1416
1417 const auto BuildCopy = [&](Register SrcReg) {
1418 if (Size == STI.getWavefrontSize()) {
1419 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1420 .addReg(SrcReg);
1421 return;
1422 }
1423
1424 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1426 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1428 .addReg(SrcReg)
1429 .addImm(AMDGPU::sub0)
1430 .addReg(HiReg)
1431 .addImm(AMDGPU::sub1);
1432 };
1433
1434 if (Arg) {
1435 const int64_t Value = Arg->Value.getSExtValue();
1436 if (Value == 0) {
1437 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1438 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1439 } else if (Value == -1) // all ones
1440 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1441 else
1442 return false;
1443 } else
1444 BuildCopy(I.getOperand(2).getReg());
1445
1446 I.eraseFromParent();
1447 return true;
1448}
1449
1450bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1451 Register DstReg = I.getOperand(0).getReg();
1452 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1453 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1454 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1455 return false;
1456
1457 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1458
1459 Module *M = MF->getFunction().getParent();
1460 const MDNode *Metadata = I.getOperand(2).getMetadata();
1461 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1462 auto RelocSymbol = cast<GlobalVariable>(
1463 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1464
1465 MachineBasicBlock *BB = I.getParent();
1466 BuildMI(*BB, &I, I.getDebugLoc(),
1467 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1468 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1469
1470 I.eraseFromParent();
1471 return true;
1472}
1473
1474bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1475 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1476
1477 Register DstReg = I.getOperand(0).getReg();
1478 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1479 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1480 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1481
1482 MachineBasicBlock *MBB = I.getParent();
1483 const DebugLoc &DL = I.getDebugLoc();
1484
1485 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1486
1487 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1488 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1489 MIB.addImm(MFI->getLDSSize());
1490 } else {
1491 Module *M = MF->getFunction().getParent();
1492 const GlobalValue *GV
1493 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1494 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1495 }
1496
1497 I.eraseFromParent();
1498 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1499}
1500
1501bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1502 MachineBasicBlock *MBB = I.getParent();
1503 MachineFunction &MF = *MBB->getParent();
1504 const DebugLoc &DL = I.getDebugLoc();
1505
1506 MachineOperand &Dst = I.getOperand(0);
1507 Register DstReg = Dst.getReg();
1508 unsigned Depth = I.getOperand(2).getImm();
1509
1510 const TargetRegisterClass *RC
1511 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1512 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1513 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1514 return false;
1515
1516 // Check for kernel and shader functions
1517 if (Depth != 0 ||
1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1519 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1520 .addImm(0);
1521 I.eraseFromParent();
1522 return true;
1523 }
1524
1525 MachineFrameInfo &MFI = MF.getFrameInfo();
1526 // There is a call to @llvm.returnaddress in this function
1527 MFI.setReturnAddressIsTaken(true);
1528
1529 // Get the return address reg and mark it as an implicit live-in
1530 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1531 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1532 AMDGPU::SReg_64RegClass, DL);
1533 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1534 .addReg(LiveIn);
1535 I.eraseFromParent();
1536 return true;
1537}
1538
1539bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1540 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1541 // SelectionDAG uses for wave32 vs wave64.
1542 MachineBasicBlock *BB = MI.getParent();
1543 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1544 .add(MI.getOperand(1));
1545
1546 Register Reg = MI.getOperand(1).getReg();
1547 MI.eraseFromParent();
1548
1549 if (!MRI->getRegClassOrNull(Reg))
1550 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1551 return true;
1552}
1553
1554bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1555 MachineInstr &MI, Intrinsic::ID IntrID) const {
1556 MachineBasicBlock *MBB = MI.getParent();
1557 MachineFunction *MF = MBB->getParent();
1558 const DebugLoc &DL = MI.getDebugLoc();
1559
1560 unsigned IndexOperand = MI.getOperand(7).getImm();
1561 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1562 bool WaveDone = MI.getOperand(9).getImm() != 0;
1563
1564 if (WaveDone && !WaveRelease)
1565 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1566
1567 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1568 IndexOperand &= ~0x3f;
1569 unsigned CountDw = 0;
1570
1571 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1572 CountDw = (IndexOperand >> 24) & 0xf;
1573 IndexOperand &= ~(0xf << 24);
1574
1575 if (CountDw < 1 || CountDw > 4) {
1577 "ds_ordered_count: dword count must be between 1 and 4");
1578 }
1579 }
1580
1581 if (IndexOperand)
1582 report_fatal_error("ds_ordered_count: bad index operand");
1583
1584 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1585 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1586
1587 unsigned Offset0 = OrderedCountIndex << 2;
1588 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1589
1590 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1591 Offset1 |= (CountDw - 1) << 6;
1592
1593 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1594 Offset1 |= ShaderType << 2;
1595
1596 unsigned Offset = Offset0 | (Offset1 << 8);
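  // Worked example (sketch): an ordered-count index of 1 with wave_release=1,
  // wave_done=0 for ds_ordered_add, with no dword-count or shader-type bits
  // set, gives Offset0 = 1 << 2 = 4 and Offset1 = 1, so
  // Offset = 4 | (1 << 8) = 0x104.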
1597
1598 Register M0Val = MI.getOperand(2).getReg();
1599 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1600 .addReg(M0Val);
1601
1602 Register DstReg = MI.getOperand(0).getReg();
1603 Register ValReg = MI.getOperand(3).getReg();
1604 MachineInstrBuilder DS =
1605 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1606 .addReg(ValReg)
1607 .addImm(Offset)
1608 .cloneMemRefs(MI);
1609
1610 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1611 return false;
1612
1613 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1614 MI.eraseFromParent();
1615 return Ret;
1616}
1617
1618static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1619 switch (IntrID) {
1620 case Intrinsic::amdgcn_ds_gws_init:
1621 return AMDGPU::DS_GWS_INIT;
1622 case Intrinsic::amdgcn_ds_gws_barrier:
1623 return AMDGPU::DS_GWS_BARRIER;
1624 case Intrinsic::amdgcn_ds_gws_sema_v:
1625 return AMDGPU::DS_GWS_SEMA_V;
1626 case Intrinsic::amdgcn_ds_gws_sema_br:
1627 return AMDGPU::DS_GWS_SEMA_BR;
1628 case Intrinsic::amdgcn_ds_gws_sema_p:
1629 return AMDGPU::DS_GWS_SEMA_P;
1630 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1631 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1632 default:
1633 llvm_unreachable("not a gws intrinsic");
1634 }
1635}
1636
1637bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1638 Intrinsic::ID IID) const {
1639 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1640 !STI.hasGWSSemaReleaseAll()))
1641 return false;
1642
1643 // intrinsic ID, vsrc, offset
1644 const bool HasVSrc = MI.getNumOperands() == 3;
1645 assert(HasVSrc || MI.getNumOperands() == 2);
1646
1647 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1648 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1650 return false;
1651
1652 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1653 unsigned ImmOffset;
1654
1655 MachineBasicBlock *MBB = MI.getParent();
1656 const DebugLoc &DL = MI.getDebugLoc();
1657
1658 MachineInstr *Readfirstlane = nullptr;
1659
1660 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1661 // incoming offset, in case there's an add of a constant. We'll have to put it
1662 // back later.
1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1664 Readfirstlane = OffsetDef;
1665 BaseOffset = OffsetDef->getOperand(1).getReg();
1666 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1667 }
1668
1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1670 // If we have a constant offset, try to use the 0 in m0 as the base.
1671 // TODO: Look into changing the default m0 initialization value. If the
1672 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1673 // the immediate offset.
1674
1675 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1676 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1677 .addImm(0);
1678 } else {
1679 std::tie(BaseOffset, ImmOffset) =
1680 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1681
1682 if (Readfirstlane) {
1683 // We have the constant offset now, so put the readfirstlane back on the
1684 // variable component.
1685 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1686 return false;
1687
1688 Readfirstlane->getOperand(1).setReg(BaseOffset);
1689 BaseOffset = Readfirstlane->getOperand(0).getReg();
1690 } else {
1691 if (!RBI.constrainGenericRegister(BaseOffset,
1692 AMDGPU::SReg_32RegClass, *MRI))
1693 return false;
1694 }
1695
1696 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1697 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1698 .addReg(BaseOffset)
1699 .addImm(16)
1700 .setOperandDead(3); // Dead scc
1701
1702 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1703 .addReg(M0Base);
1704 }
1705
1706 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1707 // offset field) % 64. Some versions of the programming guide omit the m0
1708 // part, or claim it's from offset 0.
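// For example, with the instruction's immediate offset field set to 3 and
// M0[21:16] holding 5, the hardware resource id works out to
// (<isa opaque base> + 5 + 3) % 64.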
1709 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1710
1711 if (HasVSrc) {
1712 Register VSrc = MI.getOperand(1).getReg();
1713 MIB.addReg(VSrc);
1714
1715 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1716 return false;
1717 }
1718
1719 MIB.addImm(ImmOffset)
1720 .cloneMemRefs(MI);
1721
1722 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1723
1724 MI.eraseFromParent();
1725 return true;
1726}
1727
1728bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1729 bool IsAppend) const {
1730 Register PtrBase = MI.getOperand(2).getReg();
1731 LLT PtrTy = MRI->getType(PtrBase);
1732 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1733
1734 unsigned Offset;
1735 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1736
1737 // TODO: Should this try to look through readfirstlane like GWS?
1738 if (!isDSOffsetLegal(PtrBase, Offset)) {
1739 PtrBase = MI.getOperand(2).getReg();
1740 Offset = 0;
1741 }
1742
1743 MachineBasicBlock *MBB = MI.getParent();
1744 const DebugLoc &DL = MI.getDebugLoc();
1745 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1746
1747 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1748 .addReg(PtrBase);
1749 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1750 return false;
1751
1752 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1753 .addImm(Offset)
1754 .addImm(IsGDS ? -1 : 0)
1755 .cloneMemRefs(MI);
1756 MI.eraseFromParent();
1757 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1758}
1759
1760bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1761 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1762 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1763 if (WGSize <= STI.getWavefrontSize()) {
1764 MachineBasicBlock *MBB = MI.getParent();
1765 const DebugLoc &DL = MI.getDebugLoc();
1766 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1767 MI.eraseFromParent();
1768 return true;
1769 }
1770 }
1771
1772 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1773 if (STI.hasSplitBarriers()) {
1774 MachineBasicBlock *MBB = MI.getParent();
1775 const DebugLoc &DL = MI.getDebugLoc();
1776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1777 .addImm(AMDGPU::Barrier::WORKGROUP);
1778 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1779 .addImm(AMDGPU::Barrier::WORKGROUP);
1780 MI.eraseFromParent();
1781 return true;
1782 }
1783
1784 return selectImpl(MI, *CoverageInfo);
1785}
1786
1787static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1788 bool &IsTexFail) {
1789 if (TexFailCtrl)
1790 IsTexFail = true;
1791
1792 TFE = (TexFailCtrl & 0x1) ? true : false;
1793 TexFailCtrl &= ~(uint64_t)0x1;
1794 LWE = (TexFailCtrl & 0x2) ? true : false;
1795 TexFailCtrl &= ~(uint64_t)0x2;
1796
1797 return TexFailCtrl == 0;
1798}
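// For example, a TexFailCtrl immediate of 0x3 sets both TFE and LWE and is
// accepted (returns true), while any value with bits above 0x3 set (e.g. 0x4)
// leaves residue after masking and is rejected.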
1799
1800bool AMDGPUInstructionSelector::selectImageIntrinsic(
1801 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1802 MachineBasicBlock *MBB = MI.getParent();
1803 const DebugLoc &DL = MI.getDebugLoc();
1804
1805 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1806 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1807
1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1809 unsigned IntrOpcode = Intr->BaseOpcode;
1810 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1811 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1812 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1813
1814 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1815
1816 Register VDataIn, VDataOut;
1817 LLT VDataTy;
1818 int NumVDataDwords = -1;
1819 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1820 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1821
1822 bool Unorm;
1823 if (!BaseOpcode->Sampler)
1824 Unorm = true;
1825 else
1826 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1827
1828 bool TFE;
1829 bool LWE;
1830 bool IsTexFail = false;
1831 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1832 TFE, LWE, IsTexFail))
1833 return false;
1834
1835 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1836 const bool IsA16 = (Flags & 1) != 0;
1837 const bool IsG16 = (Flags & 2) != 0;
1838
1839 // A16 implies 16-bit gradients if the subtarget doesn't support G16
1840 if (IsA16 && !STI.hasG16() && !IsG16)
1841 return false;
1842
1843 unsigned DMask = 0;
1844 unsigned DMaskLanes = 0;
1845
1846 if (BaseOpcode->Atomic) {
1847 VDataOut = MI.getOperand(0).getReg();
1848 VDataIn = MI.getOperand(2).getReg();
1849 LLT Ty = MRI->getType(VDataIn);
1850
1851 // Be careful to allow atomic swap on 16-bit element vectors.
1852 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1853 Ty.getSizeInBits() == 128 :
1854 Ty.getSizeInBits() == 64;
1855
1856 if (BaseOpcode->AtomicX2) {
1857 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1858
1859 DMask = Is64Bit ? 0xf : 0x3;
1860 NumVDataDwords = Is64Bit ? 4 : 2;
1861 } else {
1862 DMask = Is64Bit ? 0x3 : 0x1;
1863 NumVDataDwords = Is64Bit ? 2 : 1;
1864 }
1865 } else {
1866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1868
1869 if (BaseOpcode->Store) {
1870 VDataIn = MI.getOperand(1).getReg();
1871 VDataTy = MRI->getType(VDataIn);
1872 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1873 } else if (BaseOpcode->NoReturn) {
1874 NumVDataDwords = 0;
1875 } else {
1876 VDataOut = MI.getOperand(0).getReg();
1877 VDataTy = MRI->getType(VDataOut);
1878 NumVDataDwords = DMaskLanes;
1879
1880 if (IsD16 && !STI.hasUnpackedD16VMem())
1881 NumVDataDwords = (DMaskLanes + 1) / 2;
1882 }
1883 }
1884
1885 // Set G16 opcode
1886 if (Subtarget->hasG16() && IsG16) {
1887 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1888 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1889 assert(G16MappingInfo);
1890 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1891 }
1892
1893 // TODO: Check this in verifier.
1894 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1895
1896 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1897 if (BaseOpcode->Atomic)
1898 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1899 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1900 AMDGPU::CPol::VOLATILE))
1901 return false;
1902
1903 int NumVAddrRegs = 0;
1904 int NumVAddrDwords = 0;
1905 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1906 // Skip the $noregs and 0s inserted during legalization.
1907 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1908 if (!AddrOp.isReg())
1909 continue; // XXX - Break?
1910
1911 Register Addr = AddrOp.getReg();
1912 if (!Addr)
1913 break;
1914
1915 ++NumVAddrRegs;
1916 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1917 }
1918
1919 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1920 // NSA, these should have been packed into a single value in the first
1921 // address register
1922 const bool UseNSA =
1923 NumVAddrRegs != 1 &&
1924 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1925 : NumVAddrDwords == NumVAddrRegs);
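// For example, three unpacked 32-bit address registers give NumVAddrRegs ==
// NumVAddrDwords == 3 and select the NSA encoding; if the legalizer instead
// packed them into one 96-bit register, NumVAddrRegs is 1 and the non-NSA
// form is used.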
1926 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1927 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1928 return false;
1929 }
1930
1931 if (IsTexFail)
1932 ++NumVDataDwords;
1933
1934 int Opcode = -1;
1935 if (IsGFX12Plus) {
1936 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1937 NumVDataDwords, NumVAddrDwords);
1938 } else if (IsGFX11Plus) {
1939 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1940 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1941 : AMDGPU::MIMGEncGfx11Default,
1942 NumVDataDwords, NumVAddrDwords);
1943 } else if (IsGFX10Plus) {
1944 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1945 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1946 : AMDGPU::MIMGEncGfx10Default,
1947 NumVDataDwords, NumVAddrDwords);
1948 } else {
1949 if (Subtarget->hasGFX90AInsts()) {
1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1951 NumVDataDwords, NumVAddrDwords);
1952 if (Opcode == -1) {
1953 LLVM_DEBUG(
1954 dbgs()
1955 << "requested image instruction is not supported on this GPU\n");
1956 return false;
1957 }
1958 }
1959 if (Opcode == -1 &&
1960 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1961 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1962 NumVDataDwords, NumVAddrDwords);
1963 if (Opcode == -1)
1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1965 NumVDataDwords, NumVAddrDwords);
1966 }
1967 if (Opcode == -1)
1968 return false;
1969
1970 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1971 .cloneMemRefs(MI);
1972
1973 if (VDataOut) {
1974 if (BaseOpcode->AtomicX2) {
1975 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1976
1977 Register TmpReg = MRI->createVirtualRegister(
1978 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1979 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1980
1981 MIB.addDef(TmpReg);
1982 if (!MRI->use_empty(VDataOut)) {
1983 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1984 .addReg(TmpReg, RegState::Kill, SubReg);
1985 }
1986
1987 } else {
1988 MIB.addDef(VDataOut); // vdata output
1989 }
1990 }
1991
1992 if (VDataIn)
1993 MIB.addReg(VDataIn); // vdata input
1994
1995 for (int I = 0; I != NumVAddrRegs; ++I) {
1996 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1997 if (SrcOp.isReg()) {
1998 assert(SrcOp.getReg() != 0);
1999 MIB.addReg(SrcOp.getReg());
2000 }
2001 }
2002
2003 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2004 if (BaseOpcode->Sampler)
2005 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2006
2007 MIB.addImm(DMask); // dmask
2008
2009 if (IsGFX10Plus)
2010 MIB.addImm(DimInfo->Encoding);
2011 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2012 MIB.addImm(Unorm);
2013
2014 MIB.addImm(CPol);
2015 MIB.addImm(IsA16 && // a16 or r128
2016 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2017 if (IsGFX10Plus)
2018 MIB.addImm(IsA16 ? -1 : 0);
2019
2020 if (!Subtarget->hasGFX90AInsts()) {
2021 MIB.addImm(TFE); // tfe
2022 } else if (TFE) {
2023 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2024 return false;
2025 }
2026
2027 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2028 MIB.addImm(LWE); // lwe
2029 if (!IsGFX10Plus)
2030 MIB.addImm(DimInfo->DA ? -1 : 0);
2031 if (BaseOpcode->HasD16)
2032 MIB.addImm(IsD16 ? -1 : 0);
2033
2034 MI.eraseFromParent();
2035 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2036 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2037 return true;
2038}
2039
2040// We need to handle this here because tablegen doesn't support matching
2041// instructions with multiple outputs.
2042bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2043 MachineInstr &MI) const {
2044 Register Dst0 = MI.getOperand(0).getReg();
2045 Register Dst1 = MI.getOperand(1).getReg();
2046
2047 const DebugLoc &DL = MI.getDebugLoc();
2048 MachineBasicBlock *MBB = MI.getParent();
2049
2050 Register Addr = MI.getOperand(3).getReg();
2051 Register Data0 = MI.getOperand(4).getReg();
2052 Register Data1 = MI.getOperand(5).getReg();
2053 unsigned Offset = MI.getOperand(6).getImm();
2054
2055 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2056 .addDef(Dst1)
2057 .addUse(Addr)
2058 .addUse(Data0)
2059 .addUse(Data1)
2060 .addImm(Offset)
2061 .cloneMemRefs(MI);
2062
2063 MI.eraseFromParent();
2064 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2065}
2066
2067bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2068 MachineInstr &I) const {
2069 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2070 switch (IntrinsicID) {
2071 case Intrinsic::amdgcn_end_cf:
2072 return selectEndCfIntrinsic(I);
2073 case Intrinsic::amdgcn_ds_ordered_add:
2074 case Intrinsic::amdgcn_ds_ordered_swap:
2075 return selectDSOrderedIntrinsic(I, IntrinsicID);
2076 case Intrinsic::amdgcn_ds_gws_init:
2077 case Intrinsic::amdgcn_ds_gws_barrier:
2078 case Intrinsic::amdgcn_ds_gws_sema_v:
2079 case Intrinsic::amdgcn_ds_gws_sema_br:
2080 case Intrinsic::amdgcn_ds_gws_sema_p:
2081 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2082 return selectDSGWSIntrinsic(I, IntrinsicID);
2083 case Intrinsic::amdgcn_ds_append:
2084 return selectDSAppendConsume(I, true);
2085 case Intrinsic::amdgcn_ds_consume:
2086 return selectDSAppendConsume(I, false);
2087 case Intrinsic::amdgcn_s_barrier:
2088 return selectSBarrier(I);
2089 case Intrinsic::amdgcn_raw_buffer_load_lds:
2090 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2091 case Intrinsic::amdgcn_struct_buffer_load_lds:
2092 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2093 return selectBufferLoadLds(I);
2094 case Intrinsic::amdgcn_global_load_lds:
2095 return selectGlobalLoadLds(I);
2096 case Intrinsic::amdgcn_exp_compr:
2097 if (!STI.hasCompressedExport()) {
2098 Function &F = I.getMF()->getFunction();
2099 DiagnosticInfoUnsupported NoFpRet(
2100 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2101 F.getContext().diagnose(NoFpRet);
2102 return false;
2103 }
2104 break;
2105 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2106 return selectDSBvhStackIntrinsic(I);
2107 case Intrinsic::amdgcn_s_barrier_init:
2108 case Intrinsic::amdgcn_s_barrier_join:
2109 case Intrinsic::amdgcn_s_wakeup_barrier:
2110 case Intrinsic::amdgcn_s_get_barrier_state:
2111 return selectNamedBarrierInst(I, IntrinsicID);
2112 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2113 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2114 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2115 case Intrinsic::amdgcn_s_barrier_leave:
2116 return selectSBarrierLeave(I);
2117 }
2118 return selectImpl(I, *CoverageInfo);
2119}
2120
2121bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2122 if (selectImpl(I, *CoverageInfo))
2123 return true;
2124
2125 MachineBasicBlock *BB = I.getParent();
2126 const DebugLoc &DL = I.getDebugLoc();
2127
2128 Register DstReg = I.getOperand(0).getReg();
2129 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2130 assert(Size <= 32 || Size == 64);
2131 const MachineOperand &CCOp = I.getOperand(1);
2132 Register CCReg = CCOp.getReg();
2133 if (!isVCC(CCReg, *MRI)) {
2134 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2135 AMDGPU::S_CSELECT_B32;
2136 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2137 .addReg(CCReg);
2138
2139 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2140 // bank, because it does not cover the register class that we use to represent
2141 // it. So we need to set the register class manually here.
2142 if (!MRI->getRegClassOrNull(CCReg))
2143 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2144 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2145 .add(I.getOperand(2))
2146 .add(I.getOperand(3));
2147
2148 bool Ret = false;
2149 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2150 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2151 I.eraseFromParent();
2152 return Ret;
2153 }
2154
2155 // Wide VGPR select should have been split in RegBankSelect.
2156 if (Size > 32)
2157 return false;
2158
2159 MachineInstr *Select =
2160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2161 .addImm(0)
2162 .add(I.getOperand(3))
2163 .addImm(0)
2164 .add(I.getOperand(2))
2165 .add(I.getOperand(1));
2166
2167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2168 I.eraseFromParent();
2169 return Ret;
2170}
2171
2172bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2173 Register DstReg = I.getOperand(0).getReg();
2174 Register SrcReg = I.getOperand(1).getReg();
2175 const LLT DstTy = MRI->getType(DstReg);
2176 const LLT SrcTy = MRI->getType(SrcReg);
2177 const LLT S1 = LLT::scalar(1);
2178
2179 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2180 const RegisterBank *DstRB;
2181 if (DstTy == S1) {
2182 // This is a special case. We don't treat s1 for legalization artifacts as
2183 // vcc booleans.
2184 DstRB = SrcRB;
2185 } else {
2186 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2187 if (SrcRB != DstRB)
2188 return false;
2189 }
2190
2191 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2192
2193 unsigned DstSize = DstTy.getSizeInBits();
2194 unsigned SrcSize = SrcTy.getSizeInBits();
2195
2196 const TargetRegisterClass *SrcRC =
2197 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2198 const TargetRegisterClass *DstRC =
2199 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2200 if (!SrcRC || !DstRC)
2201 return false;
2202
2203 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2204 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2205 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2206 return false;
2207 }
2208
2209 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2210 MachineBasicBlock *MBB = I.getParent();
2211 const DebugLoc &DL = I.getDebugLoc();
2212
2213 Register LoReg = MRI->createVirtualRegister(DstRC);
2214 Register HiReg = MRI->createVirtualRegister(DstRC);
2215 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2216 .addReg(SrcReg, 0, AMDGPU::sub0);
2217 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2218 .addReg(SrcReg, 0, AMDGPU::sub1);
2219
2220 if (IsVALU && STI.hasSDWA()) {
2221 // Write the low 16-bits of the high element into the high 16-bits of the
2222 // low element.
2223 MachineInstr *MovSDWA =
2224 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2225 .addImm(0) // $src0_modifiers
2226 .addReg(HiReg) // $src0
2227 .addImm(0) // $clamp
2228 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2229 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2230 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2231 .addReg(LoReg, RegState::Implicit);
2232 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2233 } else {
2234 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2235 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2236 Register ImmReg = MRI->createVirtualRegister(DstRC);
2237 if (IsVALU) {
2238 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2239 .addImm(16)
2240 .addReg(HiReg);
2241 } else {
2242 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2243 .addReg(HiReg)
2244 .addImm(16)
2245 .setOperandDead(3); // Dead scc
2246 }
2247
2248 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2249 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2250 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2251
2252 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2253 .addImm(0xffff);
2254 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2255 .addReg(LoReg)
2256 .addReg(ImmReg);
2257 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2258 .addReg(TmpReg0)
2259 .addReg(TmpReg1);
2260
2261 if (!IsVALU) {
2262 And.setOperandDead(3); // Dead scc
2263 Or.setOperandDead(3); // Dead scc
2264 }
2265 }
2266
2267 I.eraseFromParent();
2268 return true;
2269 }
2270
2271 if (!DstTy.isScalar())
2272 return false;
2273
2274 if (SrcSize > 32) {
2275 unsigned SubRegIdx =
2276 DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2277 if (SubRegIdx == AMDGPU::NoSubRegister)
2278 return false;
2279
2280 // Deal with weird cases where the class only partially supports the subreg
2281 // index.
2282 const TargetRegisterClass *SrcWithSubRC
2283 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2284 if (!SrcWithSubRC)
2285 return false;
2286
2287 if (SrcWithSubRC != SrcRC) {
2288 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2289 return false;
2290 }
2291
2292 I.getOperand(1).setSubReg(SubRegIdx);
2293 }
2294
2295 I.setDesc(TII.get(TargetOpcode::COPY));
2296 return true;
2297}
2298
2299/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2300static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2301 Mask = maskTrailingOnes<unsigned>(Size);
2302 int SignedMask = static_cast<int>(Mask);
2303 return SignedMask >= -16 && SignedMask <= 64;
2304}
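// For example, Size = 6 produces Mask = 0x3f (63), which fits the inline
// immediate range [-16, 64], while Size = 8 produces 0xff (255), which does
// not and would require a literal.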
2305
2306// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2307const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2308 Register Reg, const MachineRegisterInfo &MRI,
2309 const TargetRegisterInfo &TRI) const {
2310 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2311 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2312 return RB;
2313
2314 // Ignore the type, since we don't use vcc in artifacts.
2315 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2316 return &RBI.getRegBankFromRegClass(*RC, LLT());
2317 return nullptr;
2318}
2319
2320bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2321 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2322 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2323 const DebugLoc &DL = I.getDebugLoc();
2324 MachineBasicBlock &MBB = *I.getParent();
2325 const Register DstReg = I.getOperand(0).getReg();
2326 const Register SrcReg = I.getOperand(1).getReg();
2327
2328 const LLT DstTy = MRI->getType(DstReg);
2329 const LLT SrcTy = MRI->getType(SrcReg);
2330 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2331 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2332 const unsigned DstSize = DstTy.getSizeInBits();
2333 if (!DstTy.isScalar())
2334 return false;
2335
2336 // Artifact casts should never use vcc.
2337 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2338
2339 // FIXME: This should probably be illegal and split earlier.
2340 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2341 if (DstSize <= 32)
2342 return selectCOPY(I);
2343
2344 const TargetRegisterClass *SrcRC =
2345 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2346 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2347 const TargetRegisterClass *DstRC =
2348 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2349
2350 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2351 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2352 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2353 .addReg(SrcReg)
2354 .addImm(AMDGPU::sub0)
2355 .addReg(UndefReg)
2356 .addImm(AMDGPU::sub1);
2357 I.eraseFromParent();
2358
2359 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2360 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2361 }
2362
2363 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2364 // 64-bit should have been split up in RegBankSelect
2365
2366 // Try to use an and with a mask if it will save code size.
2367 unsigned Mask;
2368 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2369 MachineInstr *ExtI =
2370 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2371 .addImm(Mask)
2372 .addReg(SrcReg);
2373 I.eraseFromParent();
2374 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2375 }
2376
2377 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2378 MachineInstr *ExtI =
2379 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2380 .addReg(SrcReg)
2381 .addImm(0) // Offset
2382 .addImm(SrcSize); // Width
2383 I.eraseFromParent();
2384 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2385 }
2386
2387 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2388 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2389 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2390 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2391 return false;
2392
2393 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2394 const unsigned SextOpc = SrcSize == 8 ?
2395 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2396 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2397 .addReg(SrcReg);
2398 I.eraseFromParent();
2399 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2400 }
2401
2402 // Using a single 32-bit SALU to calculate the high half is smaller than
2403 // S_BFE with a literal constant operand.
2404 if (DstSize > 32 && SrcSize == 32) {
2405 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2406 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2407 if (Signed) {
2408 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2409 .addReg(SrcReg, 0, SubReg)
2410 .addImm(31)
2411 .setOperandDead(3); // Dead scc
2412 } else {
2413 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2414 .addImm(0);
2415 }
2416 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2417 .addReg(SrcReg, 0, SubReg)
2418 .addImm(AMDGPU::sub0)
2419 .addReg(HiReg)
2420 .addImm(AMDGPU::sub1);
2421 I.eraseFromParent();
2422 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2423 *MRI);
2424 }
2425
2426 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2427 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2428
2429 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
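// For example, .addImm(SrcSize << 16) with SrcSize = 8 encodes offset 0 and
// width 8 as the literal 0x80000.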
2430 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2431 // We need a 64-bit register source, but the high bits don't matter.
2432 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2433 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2434 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2435
2436 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2437 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2438 .addReg(SrcReg, 0, SubReg)
2439 .addImm(AMDGPU::sub0)
2440 .addReg(UndefReg)
2441 .addImm(AMDGPU::sub1);
2442
2443 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2444 .addReg(ExtReg)
2445 .addImm(SrcSize << 16);
2446
2447 I.eraseFromParent();
2448 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2449 }
2450
2451 unsigned Mask;
2452 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2453 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2454 .addReg(SrcReg)
2455 .addImm(Mask)
2456 .setOperandDead(3); // Dead scc
2457 } else {
2458 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2459 .addReg(SrcReg)
2460 .addImm(SrcSize << 16);
2461 }
2462
2463 I.eraseFromParent();
2464 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2465 }
2466
2467 return false;
2468}
2469
2470static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2471 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2472}
2473
2474static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2475 Register BitcastSrc;
2476 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2477 Reg = BitcastSrc;
2478 return Reg;
2479}
2480
2481static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2482 Register &Out) {
2483 Register Trunc;
2484 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2485 return false;
2486
2487 Register LShlSrc;
2488 Register Cst;
2489 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2490 Cst = stripCopy(Cst, MRI);
2491 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2492 Out = stripBitCast(LShlSrc, MRI);
2493 return true;
2494 }
2495 }
2496
2497 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2498 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2499 return false;
2500
2501 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2502 LLT::fixed_vector(2, 16));
2503
2504 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2505 assert(Mask.size() == 2);
2506
2507 if (Mask[0] == 1 && Mask[1] <= 1) {
2508 Out = Shuffle->getOperand(0).getReg();
2509 return true;
2510 }
2511
2512 return false;
2513}
2514
2515bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2516 if (!Subtarget->hasSALUFloatInsts())
2517 return false;
2518
2519 Register Dst = I.getOperand(0).getReg();
2520 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2521 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2522 return false;
2523
2524 Register Src = I.getOperand(1).getReg();
2525
2526 if (MRI->getType(Dst) == LLT::scalar(32) &&
2527 MRI->getType(Src) == LLT::scalar(16)) {
2528 if (isExtractHiElt(*MRI, Src, Src)) {
2529 MachineBasicBlock *BB = I.getParent();
2530 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2531 .addUse(Src);
2532 I.eraseFromParent();
2533 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2534 }
2535 }
2536
2537 return false;
2538}
2539
2540bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2541 // Only manually handle the f64 SGPR case.
2542 //
2543 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2544 // the bit ops theoretically have a second result due to the implicit def of
2545 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2546 // that is easy by disabling the check. The result works, but uses a
2547 // nonsensical sreg32orlds_and_sreg_1 regclass.
2548 //
2549 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2550 // the variadic REG_SEQUENCE operands.
2551
2552 Register Dst = MI.getOperand(0).getReg();
2553 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2554 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2555 MRI->getType(Dst) != LLT::scalar(64))
2556 return false;
2557
2558 Register Src = MI.getOperand(1).getReg();
2559 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2560 if (Fabs)
2561 Src = Fabs->getOperand(1).getReg();
2562
2563 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2564 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2565 return false;
2566
2567 MachineBasicBlock *BB = MI.getParent();
2568 const DebugLoc &DL = MI.getDebugLoc();
2569 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2570 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2571 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2572 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2573
2574 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2575 .addReg(Src, 0, AMDGPU::sub0);
2576 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2577 .addReg(Src, 0, AMDGPU::sub1);
2578 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2579 .addImm(0x80000000);
2580
2581 // Set or toggle sign bit.
2582 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2583 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2584 .addReg(HiReg)
2585 .addReg(ConstReg)
2586 .setOperandDead(3); // Dead scc
2587 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2588 .addReg(LoReg)
2589 .addImm(AMDGPU::sub0)
2590 .addReg(OpReg)
2591 .addImm(AMDGPU::sub1);
2592 MI.eraseFromParent();
2593 return true;
2594}
2595
2596// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2597bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2598 Register Dst = MI.getOperand(0).getReg();
2599 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2600 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2601 MRI->getType(Dst) != LLT::scalar(64))
2602 return false;
2603
2604 Register Src = MI.getOperand(1).getReg();
2605 MachineBasicBlock *BB = MI.getParent();
2606 const DebugLoc &DL = MI.getDebugLoc();
2607 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2608 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2609 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2610 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2611
2612 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2613 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2614 return false;
2615
2616 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2617 .addReg(Src, 0, AMDGPU::sub0);
2618 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2619 .addReg(Src, 0, AMDGPU::sub1);
2620 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2621 .addImm(0x7fffffff);
2622
2623 // Clear sign bit.
2624 // TODO: Should this use S_BITSET0_*?
2625 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2626 .addReg(HiReg)
2627 .addReg(ConstReg)
2628 .setOperandDead(3); // Dead scc
2629 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2630 .addReg(LoReg)
2631 .addImm(AMDGPU::sub0)
2632 .addReg(OpReg)
2633 .addImm(AMDGPU::sub1);
2634
2635 MI.eraseFromParent();
2636 return true;
2637}
2638
2639static bool isConstant(const MachineInstr &MI) {
2640 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2641}
2642
2643void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2644 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2645
2646 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2647 const MachineInstr *PtrMI =
2648 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2649
2650 assert(PtrMI);
2651
2652 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2653 return;
2654
2655 GEPInfo GEPInfo;
2656
2657 for (unsigned i = 1; i != 3; ++i) {
2658 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2659 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2660 assert(OpDef);
2661 if (i == 2 && isConstant(*OpDef)) {
2662 // TODO: Could handle constant base + variable offset, but a combine
2663 // probably should have commuted it.
2664 assert(GEPInfo.Imm == 0);
2665 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2666 continue;
2667 }
2668 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2669 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2670 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2671 else
2672 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2673 }
2674
2675 AddrInfo.push_back(GEPInfo);
2676 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2677}
2678
2679bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2680 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2681}
2682
2683bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2684 if (!MI.hasOneMemOperand())
2685 return false;
2686
2687 const MachineMemOperand *MMO = *MI.memoperands_begin();
2688 const Value *Ptr = MMO->getValue();
2689
2690 // UndefValue means this is a load of a kernel input. These are uniform.
2691 // Sometimes LDS instructions have constant pointers.
2692 // If Ptr is null, then that means this mem operand contains a
2693 // PseudoSourceValue like GOT.
2694 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2695 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2696 return true;
2697
2698 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2699 return true;
2700
2701 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2702 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2703 AMDGPU::SGPRRegBankID;
2704
2705 const Instruction *I = dyn_cast<Instruction>(Ptr);
2706 return I && I->getMetadata("amdgpu.uniform");
2707}
2708
2709bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2710 for (const GEPInfo &GEPInfo : AddrInfo) {
2711 if (!GEPInfo.VgprParts.empty())
2712 return true;
2713 }
2714 return false;
2715}
2716
2717void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2718 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2719 unsigned AS = PtrTy.getAddressSpace();
2720 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2721 STI.ldsRequiresM0Init()) {
2722 MachineBasicBlock *BB = I.getParent();
2723
2724 // If DS instructions require M0 initialization, insert it before selecting.
2725 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2726 .addImm(-1);
2727 }
2728}
2729
2730bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2731 MachineInstr &I) const {
2732 initM0(I);
2733 return selectImpl(I, *CoverageInfo);
2734}
2735
2736static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2737 if (Reg.isPhysical())
2738 return false;
2739
2740 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2741 const unsigned Opcode = MI.getOpcode();
2742
2743 if (Opcode == AMDGPU::COPY)
2744 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2745
2746 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2747 Opcode == AMDGPU::G_XOR)
2748 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2749 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2750
2751 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2752 return GI->is(Intrinsic::amdgcn_class);
2753
2754 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2755}
2756
2757bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2758 MachineBasicBlock *BB = I.getParent();
2759 MachineOperand &CondOp = I.getOperand(0);
2760 Register CondReg = CondOp.getReg();
2761 const DebugLoc &DL = I.getDebugLoc();
2762
2763 unsigned BrOpcode;
2764 Register CondPhysReg;
2765 const TargetRegisterClass *ConstrainRC;
2766
2767 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2768 // whether the branch is uniform when selecting the instruction. In
2769 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2770 // RegBankSelect knows what it's doing if the branch condition is scc, even
2771 // though it currently does not.
2772 if (!isVCC(CondReg, *MRI)) {
2773 if (MRI->getType(CondReg) != LLT::scalar(32))
2774 return false;
2775
2776 CondPhysReg = AMDGPU::SCC;
2777 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2778 ConstrainRC = &AMDGPU::SReg_32RegClass;
2779 } else {
2780 // FIXME: Should scc->vcc copies and with exec?
2781
2782 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2783 // need to insert an and with exec.
2784 if (!isVCmpResult(CondReg, *MRI)) {
2785 const bool Is64 = STI.isWave64();
2786 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2787 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2788
2789 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2790 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2791 .addReg(CondReg)
2792 .addReg(Exec)
2793 .setOperandDead(3); // Dead scc
2794 CondReg = TmpReg;
2795 }
2796
2797 CondPhysReg = TRI.getVCC();
2798 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2799 ConstrainRC = TRI.getBoolRC();
2800 }
2801
2802 if (!MRI->getRegClassOrNull(CondReg))
2803 MRI->setRegClass(CondReg, ConstrainRC);
2804
2805 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2806 .addReg(CondReg);
2807 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2808 .addMBB(I.getOperand(1).getMBB());
2809
2810 I.eraseFromParent();
2811 return true;
2812}
2813
2814bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2815 MachineInstr &I) const {
2816 Register DstReg = I.getOperand(0).getReg();
2817 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2818 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2819 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2820 if (IsVGPR)
2821 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2822
2823 return RBI.constrainGenericRegister(
2824 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2825}
2826
2827bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2828 Register DstReg = I.getOperand(0).getReg();
2829 Register SrcReg = I.getOperand(1).getReg();
2830 Register MaskReg = I.getOperand(2).getReg();
2831 LLT Ty = MRI->getType(DstReg);
2832 LLT MaskTy = MRI->getType(MaskReg);
2833 MachineBasicBlock *BB = I.getParent();
2834 const DebugLoc &DL = I.getDebugLoc();
2835
2836 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2837 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2838 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2839 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2840 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2841 return false;
2842
2843 // Try to avoid emitting a bit operation when we only need to touch half of
2844 // the 64-bit pointer.
2845 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2846 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2847 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2848
2849 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2850 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
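// For example, a constant mask of 0xFFFFFFFFFFFFF000 (clearing only low
// alignment bits) has all ones in the high 32 bits, so CanCopyHi32 is true:
// the high half is simply copied and only the low half needs an AND.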
2851
2852 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2853 !CanCopyLow32 && !CanCopyHi32) {
2854 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2855 .addReg(SrcReg)
2856 .addReg(MaskReg)
2857 .setOperandDead(3); // Dead scc
2858 I.eraseFromParent();
2859 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2860 }
2861
2862 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2863 const TargetRegisterClass &RegRC
2864 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2865
2866 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2867 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2868 const TargetRegisterClass *MaskRC =
2869 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2870
2871 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2872 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2873 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2874 return false;
2875
2876 if (Ty.getSizeInBits() == 32) {
2877 assert(MaskTy.getSizeInBits() == 32 &&
2878 "ptrmask should have been narrowed during legalize");
2879
2880 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2881 .addReg(SrcReg)
2882 .addReg(MaskReg);
2883
2884 if (!IsVGPR)
2885 NewOp.setOperandDead(3); // Dead scc
2886 I.eraseFromParent();
2887 return true;
2888 }
2889
2890 Register HiReg = MRI->createVirtualRegister(&RegRC);
2891 Register LoReg = MRI->createVirtualRegister(&RegRC);
2892
2893 // Extract the subregisters from the source pointer.
2894 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2895 .addReg(SrcReg, 0, AMDGPU::sub0);
2896 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2897 .addReg(SrcReg, 0, AMDGPU::sub1);
2898
2899 Register MaskedLo, MaskedHi;
2900
2901 if (CanCopyLow32) {
2902 // If all the bits in the low half are 1, we only need a copy for it.
2903 MaskedLo = LoReg;
2904 } else {
2905 // Extract the mask subregister and apply the and.
2906 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2907 MaskedLo = MRI->createVirtualRegister(&RegRC);
2908
2909 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2910 .addReg(MaskReg, 0, AMDGPU::sub0);
2911 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2912 .addReg(LoReg)
2913 .addReg(MaskLo);
2914 }
2915
2916 if (CanCopyHi32) {
2917 // If all the bits in the high half are 1, we only need a copy for it.
2918 MaskedHi = HiReg;
2919 } else {
2920 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2921 MaskedHi = MRI->createVirtualRegister(&RegRC);
2922
2923 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2924 .addReg(MaskReg, 0, AMDGPU::sub1);
2925 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2926 .addReg(HiReg)
2927 .addReg(MaskHi);
2928 }
2929
2930 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2931 .addReg(MaskedLo)
2932 .addImm(AMDGPU::sub0)
2933 .addReg(MaskedHi)
2934 .addImm(AMDGPU::sub1);
2935 I.eraseFromParent();
2936 return true;
2937}
2938
2939/// Return the register to use for the index value, and the subregister to use
2940/// for the indirectly accessed register.
2941static std::pair<Register, unsigned>
2942computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
2943 const TargetRegisterClass *SuperRC, Register IdxReg,
2944 unsigned EltSize, GISelKnownBits &KnownBits) {
2945 Register IdxBaseReg;
2946 int Offset;
2947
2948 std::tie(IdxBaseReg, Offset) =
2949 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
2950 if (IdxBaseReg == AMDGPU::NoRegister) {
2951 // This will happen if the index is a known constant. This should ordinarily
2952 // be legalized out, but handle it as a register just in case.
2953 assert(Offset == 0);
2954 IdxBaseReg = IdxReg;
2955 }
2956
2957 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2958
2959 // Skip out of bounds offsets, or else we would end up using an undefined
2960 // register.
2961 if (static_cast<unsigned>(Offset) >= SubRegs.size())
2962 return std::pair(IdxReg, SubRegs[0]);
2963 return std::pair(IdxBaseReg, SubRegs[Offset]);
2964}
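// For example, indexing 32-bit elements (EltSize = 4) of a 256-bit super
// register with an index known (via KnownBits) to be %base + 2 returns
// {%base, sub2}, so the indirect access starts at the third dword.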
2965
2966bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2967 MachineInstr &MI) const {
2968 Register DstReg = MI.getOperand(0).getReg();
2969 Register SrcReg = MI.getOperand(1).getReg();
2970 Register IdxReg = MI.getOperand(2).getReg();
2971
2972 LLT DstTy = MRI->getType(DstReg);
2973 LLT SrcTy = MRI->getType(SrcReg);
2974
2975 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2976 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2977 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2978
2979 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2980 // into a waterfall loop.
2981 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2982 return false;
2983
2984 const TargetRegisterClass *SrcRC =
2985 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
2986 const TargetRegisterClass *DstRC =
2987 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
2988 if (!SrcRC || !DstRC)
2989 return false;
2990 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2991 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2992 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2993 return false;
2994
2995 MachineBasicBlock *BB = MI.getParent();
2996 const DebugLoc &DL = MI.getDebugLoc();
2997 const bool Is64 = DstTy.getSizeInBits() == 64;
2998
2999 unsigned SubReg;
3000 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3001 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3002
3003 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3004 if (DstTy.getSizeInBits() != 32 && !Is64)
3005 return false;
3006
3007 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3008 .addReg(IdxReg);
3009
3010 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3011 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3012 .addReg(SrcReg, 0, SubReg)
3013 .addReg(SrcReg, RegState::Implicit);
3014 MI.eraseFromParent();
3015 return true;
3016 }
3017
3018 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3019 return false;
3020
3021 if (!STI.useVGPRIndexMode()) {
3022 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3023 .addReg(IdxReg);
3024 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3025 .addReg(SrcReg, 0, SubReg)
3026 .addReg(SrcReg, RegState::Implicit);
3027 MI.eraseFromParent();
3028 return true;
3029 }
3030
3031 const MCInstrDesc &GPRIDXDesc =
3032 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3033 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3034 .addReg(SrcReg)
3035 .addReg(IdxReg)
3036 .addImm(SubReg);
3037
3038 MI.eraseFromParent();
3039 return true;
3040}
3041
3042// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3043bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3044 MachineInstr &MI) const {
3045 Register DstReg = MI.getOperand(0).getReg();
3046 Register VecReg = MI.getOperand(1).getReg();
3047 Register ValReg = MI.getOperand(2).getReg();
3048 Register IdxReg = MI.getOperand(3).getReg();
3049
3050 LLT VecTy = MRI->getType(DstReg);
3051 LLT ValTy = MRI->getType(ValReg);
3052 unsigned VecSize = VecTy.getSizeInBits();
3053 unsigned ValSize = ValTy.getSizeInBits();
3054
3055 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3056 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3057 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3058
3059 assert(VecTy.getElementType() == ValTy);
3060
3061 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3062 // into a waterfall loop.
3063 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3064 return false;
3065
3066 const TargetRegisterClass *VecRC =
3067 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3068 const TargetRegisterClass *ValRC =
3069 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3070
3071 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3072 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3073 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3074 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3075 return false;
3076
3077 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3078 return false;
3079
3080 unsigned SubReg;
3081 std::tie(IdxReg, SubReg) =
3082 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3083
3084 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3085 STI.useVGPRIndexMode();
3086
3087 MachineBasicBlock *BB = MI.getParent();
3088 const DebugLoc &DL = MI.getDebugLoc();
3089
3090 if (!IndexMode) {
3091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3092 .addReg(IdxReg);
3093
3094 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3095 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3096 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3097 .addReg(VecReg)
3098 .addReg(ValReg)
3099 .addImm(SubReg);
3100 MI.eraseFromParent();
3101 return true;
3102 }
3103
3104 const MCInstrDesc &GPRIDXDesc =
3105 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3106 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3107 .addReg(VecReg)
3108 .addReg(ValReg)
3109 .addReg(IdxReg)
3110 .addImm(SubReg);
3111
3112 MI.eraseFromParent();
3113 return true;
3114}
3115
3116bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3118 unsigned Opc;
3119 unsigned Size = MI.getOperand(3).getImm();
3120
3121 // The struct intrinsic variants add one additional operand over raw.
3122 const bool HasVIndex = MI.getNumOperands() == 9;
3123 Register VIndex;
3124 int OpOffset = 0;
3125 if (HasVIndex) {
3126 VIndex = MI.getOperand(4).getReg();
3127 OpOffset = 1;
3128 }
3129
3130 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3131 std::optional<ValueAndVReg> MaybeVOffset =
3132 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3133 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3134
3135 switch (Size) {
3136 default:
3137 return false;
3138 case 1:
3139 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3140 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3141 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3142 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3143 break;
3144 case 2:
3145 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3146 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3147 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3148 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3149 break;
3150 case 4:
3151 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3152 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3153 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3154 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3155 break;
3156 }
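// For example, a struct buffer load of Size = 4 with both a vindex and a
// non-zero voffset selects BUFFER_LOAD_DWORD_LDS_BOTHEN; the two VGPRs are
// then packed into a 64-bit register pair below.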
3157
3158 MachineBasicBlock *MBB = MI.getParent();
3159 const DebugLoc &DL = MI.getDebugLoc();
3160 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3161 .add(MI.getOperand(2));
3162
3163 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3164
3165 if (HasVIndex && HasVOffset) {
3166 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3167 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3168 .addReg(VIndex)
3169 .addImm(AMDGPU::sub0)
3170 .addReg(VOffset)
3171 .addImm(AMDGPU::sub1);
3172
3173 MIB.addReg(IdxReg);
3174 } else if (HasVIndex) {
3175 MIB.addReg(VIndex);
3176 } else if (HasVOffset) {
3177 MIB.addReg(VOffset);
3178 }
3179
3180 MIB.add(MI.getOperand(1)); // rsrc
3181 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3182 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3183 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3184 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3185 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3186
3187 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3188 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3189 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3190 MachinePointerInfo StorePtrI = LoadPtrI;
3191 StorePtrI.V = nullptr;
3192 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3193
3194 auto F = LoadMMO->getFlags() &
3195 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3196 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3197 Size, LoadMMO->getBaseAlign());
3198
3199 MachineMemOperand *StoreMMO =
3200 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3201 sizeof(int32_t), LoadMMO->getBaseAlign());
3202
3203 MIB.setMemRefs({LoadMMO, StoreMMO});
3204
3205 MI.eraseFromParent();
3206 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3207}
3208
3209/// Match a zero extend from a 32-bit value to 64-bits.
3210static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3211 Register ZExtSrc;
3212 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3213 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3214
3215 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3216 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3217 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3218 return Register();
3219
3220 assert(Def->getNumOperands() == 3 &&
3221 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3222 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3223 return Def->getOperand(1).getReg();
3224 }
3225
3226 return Register();
3227}
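// For example, %zext:_(s64) = G_ZEXT %x:_(s32) and the legalized form
// (a G_MERGE_VALUES of %x:_(s32) with a zero constant) both return %x; any
// other shape returns a null Register.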
3228
3229bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3230 unsigned Opc;
3231 unsigned Size = MI.getOperand(3).getImm();
3232
3233 switch (Size) {
3234 default:
3235 return false;
3236 case 1:
3237 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3238 break;
3239 case 2:
3240 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3241 break;
3242 case 4:
3243 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3244 break;
3245 }
3246
3247 MachineBasicBlock *MBB = MI.getParent();
3248 const DebugLoc &DL = MI.getDebugLoc();
3249 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3250 .add(MI.getOperand(2));
3251
3252 Register Addr = MI.getOperand(1).getReg();
3253 Register VOffset;
3254 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3255 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3256 if (!isSGPR(Addr)) {
3257 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3258 if (isSGPR(AddrDef->Reg)) {
3259 Addr = AddrDef->Reg;
3260 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3261 Register SAddr =
3262 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3263 if (isSGPR(SAddr)) {
3264 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3265 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3266 Addr = SAddr;
3267 VOffset = Off;
3268 }
3269 }
3270 }
3271 }
3272
3273 if (isSGPR(Addr)) {
3274 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3275 if (!VOffset) {
3276 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3277 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3278 .addImm(0);
3279 }
3280 }
3281
3282 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3283 .addReg(Addr);
3284
3285 if (isSGPR(Addr))
3286 MIB.addReg(VOffset);
3287
3288 MIB.add(MI.getOperand(4)) // offset
3289 .add(MI.getOperand(5)); // cpol
3290
3291 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3292 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3293 LoadPtrI.Offset = MI.getOperand(4).getImm();
3294 MachinePointerInfo StorePtrI = LoadPtrI;
3295 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3296 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3297 auto F = LoadMMO->getFlags() &
3298 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3299 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3300 Size, LoadMMO->getBaseAlign());
3301 MachineMemOperand *StoreMMO =
3302 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3303 sizeof(int32_t), Align(4));
3304
3305 MIB.setMemRefs({LoadMMO, StoreMMO});
3306
3307 MI.eraseFromParent();
3308 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3309}
3310
3311bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3312 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3313 MI.removeOperand(1);
3314 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3315 return true;
3316}
3317
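// Map each sparse-MFMA (smfmac) intrinsic onto its VOP3 pseudo. The intrinsic
// ID operand is dropped and VDst_In is moved to the end of the operand list,
// which is where the pseudo expects it.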
3318bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3319 unsigned Opc;
3320 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3321 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3322 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3323 break;
3324 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3325 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3326 break;
3327 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3328 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3329 break;
3330 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3331 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3332 break;
3333 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3334 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3335 break;
3336 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3337 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3338 break;
3339 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3340 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3341 break;
3342 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3343 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3344 break;
3345 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3346 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3347 break;
3348 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3349 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3350 break;
3351 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3352 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3353 break;
3354 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3355 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3356 break;
3357 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3358 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3359 break;
3360 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3361 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3362 break;
3363 default:
3364 llvm_unreachable("unhandled smfmac intrinsic");
3365 }
3366
3367 auto VDst_In = MI.getOperand(4);
3368
3369 MI.setDesc(TII.get(Opc));
3370 MI.removeOperand(4); // VDst_In
3371 MI.removeOperand(1); // Intrinsic ID
3372 MI.addOperand(VDst_In); // Readd VDst_In to the end
3373 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3374 return true;
3375}
3376
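// Lower G_AMDGPU_WAVE_ADDRESS as a right shift of the source by
// log2(wavefront size), using V_LSHRREV_B32 or S_LSHR_B32 depending on the
// destination register bank.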
3377bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3378 Register DstReg = MI.getOperand(0).getReg();
3379 Register SrcReg = MI.getOperand(1).getReg();
3380 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3381 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3382 MachineBasicBlock *MBB = MI.getParent();
3383 const DebugLoc &DL = MI.getDebugLoc();
3384
3385 if (IsVALU) {
3386 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3387 .addImm(Subtarget->getWavefrontSizeLog2())
3388 .addReg(SrcReg);
3389 } else {
3390 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3391 .addReg(SrcReg)
3392 .addImm(Subtarget->getWavefrontSizeLog2())
3393 .setOperandDead(3); // Dead scc
3394 }
3395
3396 const TargetRegisterClass &RC =
3397 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3398 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3399 return false;
3400
3401 MI.eraseFromParent();
3402 return true;
3403}
3404
3405bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3406 Register SrcReg = MI.getOperand(0).getReg();
3407 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3408 return false;
3409
3410 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3411 Register SP =
3412 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3413 Register WaveAddr = getWaveAddress(DefMI);
3414 MachineBasicBlock *MBB = MI.getParent();
3415 const DebugLoc &DL = MI.getDebugLoc();
3416
3417 if (!WaveAddr) {
3418 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3419 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3420 .addReg(SrcReg)
3421 .addImm(Subtarget->getWavefrontSizeLog2())
3422 .setOperandDead(3); // Dead scc
3423 }
3424
3425 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3426 .addReg(WaveAddr);
3427
3428 MI.eraseFromParent();
3429 return true;
3430}
3431
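// Top-level instruction selection entry point. Generic opcodes are routed
// either to the TableGen-generated selectImpl() or to the hand-written
// selectG_* helpers above; for several opcodes the imported patterns are
// tried first and the manual path is the fallback.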
3432bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3433
3434 if (!I.isPreISelOpcode()) {
3435 if (I.isCopy())
3436 return selectCOPY(I);
3437 return true;
3438 }
3439
3440 switch (I.getOpcode()) {
3441 case TargetOpcode::G_AND:
3442 case TargetOpcode::G_OR:
3443 case TargetOpcode::G_XOR:
3444 if (selectImpl(I, *CoverageInfo))
3445 return true;
3446 return selectG_AND_OR_XOR(I);
3447 case TargetOpcode::G_ADD:
3448 case TargetOpcode::G_SUB:
3449 case TargetOpcode::G_PTR_ADD:
3450 if (selectImpl(I, *CoverageInfo))
3451 return true;
3452 return selectG_ADD_SUB(I);
3453 case TargetOpcode::G_UADDO:
3454 case TargetOpcode::G_USUBO:
3455 case TargetOpcode::G_UADDE:
3456 case TargetOpcode::G_USUBE:
3457 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3458 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3459 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3460 return selectG_AMDGPU_MAD_64_32(I);
3461 case TargetOpcode::G_INTTOPTR:
3462 case TargetOpcode::G_BITCAST:
3463 case TargetOpcode::G_PTRTOINT:
3464 case TargetOpcode::G_FREEZE:
3465 return selectCOPY(I);
3466 case TargetOpcode::G_FNEG:
3467 if (selectImpl(I, *CoverageInfo))
3468 return true;
3469 return selectG_FNEG(I);
3470 case TargetOpcode::G_FABS:
3471 if (selectImpl(I, *CoverageInfo))
3472 return true;
3473 return selectG_FABS(I);
3474 case TargetOpcode::G_EXTRACT:
3475 return selectG_EXTRACT(I);
3476 case TargetOpcode::G_MERGE_VALUES:
3477 case TargetOpcode::G_CONCAT_VECTORS:
3478 return selectG_MERGE_VALUES(I);
3479 case TargetOpcode::G_UNMERGE_VALUES:
3480 return selectG_UNMERGE_VALUES(I);
3481 case TargetOpcode::G_BUILD_VECTOR:
3482 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3483 return selectG_BUILD_VECTOR(I);
3484 case TargetOpcode::G_IMPLICIT_DEF:
3485 return selectG_IMPLICIT_DEF(I);
3486 case TargetOpcode::G_INSERT:
3487 return selectG_INSERT(I);
3488 case TargetOpcode::G_INTRINSIC:
3489 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3490 return selectG_INTRINSIC(I);
3491 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3492 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3493 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3494 case TargetOpcode::G_ICMP:
3495 case TargetOpcode::G_FCMP:
3496 if (selectG_ICMP_or_FCMP(I))
3497 return true;
3498 return selectImpl(I, *CoverageInfo);
3499 case TargetOpcode::G_LOAD:
3500 case TargetOpcode::G_STORE:
3501 case TargetOpcode::G_ATOMIC_CMPXCHG:
3502 case TargetOpcode::G_ATOMICRMW_XCHG:
3503 case TargetOpcode::G_ATOMICRMW_ADD:
3504 case TargetOpcode::G_ATOMICRMW_SUB:
3505 case TargetOpcode::G_ATOMICRMW_AND:
3506 case TargetOpcode::G_ATOMICRMW_OR:
3507 case TargetOpcode::G_ATOMICRMW_XOR:
3508 case TargetOpcode::G_ATOMICRMW_MIN:
3509 case TargetOpcode::G_ATOMICRMW_MAX:
3510 case TargetOpcode::G_ATOMICRMW_UMIN:
3511 case TargetOpcode::G_ATOMICRMW_UMAX:
3512 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3513 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3514 case TargetOpcode::G_ATOMICRMW_FADD:
3515 case TargetOpcode::G_ATOMICRMW_FMIN:
3516 case TargetOpcode::G_ATOMICRMW_FMAX:
3517 return selectG_LOAD_STORE_ATOMICRMW(I);
3518 case TargetOpcode::G_SELECT:
3519 return selectG_SELECT(I);
3520 case TargetOpcode::G_TRUNC:
3521 return selectG_TRUNC(I);
3522 case TargetOpcode::G_SEXT:
3523 case TargetOpcode::G_ZEXT:
3524 case TargetOpcode::G_ANYEXT:
3525 case TargetOpcode::G_SEXT_INREG:
3526 // This is a workaround. For extension from type i1, `selectImpl()` uses
3527 // patterns from the TD file and generates an illegal VGPR to SGPR COPY, as
3528 // type i1 can only be held in an SGPR class.
3529 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3530 selectImpl(I, *CoverageInfo))
3531 return true;
3532 return selectG_SZA_EXT(I);
3533 case TargetOpcode::G_FPEXT:
3534 if (selectG_FPEXT(I))
3535 return true;
3536 return selectImpl(I, *CoverageInfo);
3537 case TargetOpcode::G_BRCOND:
3538 return selectG_BRCOND(I);
3539 case TargetOpcode::G_GLOBAL_VALUE:
3540 return selectG_GLOBAL_VALUE(I);
3541 case TargetOpcode::G_PTRMASK:
3542 return selectG_PTRMASK(I);
3543 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3544 return selectG_EXTRACT_VECTOR_ELT(I);
3545 case TargetOpcode::G_INSERT_VECTOR_ELT:
3546 return selectG_INSERT_VECTOR_ELT(I);
3547 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3548 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3549 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3550 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3551 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3552 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3553 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3554 assert(Intr && "not an image intrinsic with image pseudo");
3555 return selectImageIntrinsic(I, Intr);
3556 }
3557 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3558 return selectBVHIntrinsic(I);
3559 case AMDGPU::G_SBFX:
3560 case AMDGPU::G_UBFX:
3561 return selectG_SBFX_UBFX(I);
3562 case AMDGPU::G_SI_CALL:
3563 I.setDesc(TII.get(AMDGPU::SI_CALL));
3564 return true;
3565 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3566 return selectWaveAddress(I);
3567 case AMDGPU::G_STACKRESTORE:
3568 return selectStackRestore(I);
3569 case AMDGPU::G_PHI:
3570 return selectPHI(I);
3571 case TargetOpcode::G_CONSTANT:
3572 case TargetOpcode::G_FCONSTANT:
3573 default:
3574 return selectImpl(I, *CoverageInfo);
3575 }
3576 return false;
3577}
3578
3579InstructionSelector::ComplexRendererFns
3580AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3581 return {{
3582 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3583 }};
3584
3585}
3586
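// Peel G_FNEG / G_FABS (and an fsub-from-zero acting as fneg) off a VOP3
// source and accumulate the corresponding NEG/ABS source-modifier bits,
// e.g. (fneg (fabs x)) folds to x with NEG and ABS set when abs is allowed.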
3587std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
3588 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
3589 unsigned Mods = 0;
3590 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3591
3592 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3593 Src = MI->getOperand(1).getReg();
3594 Mods |= SISrcMods::NEG;
3595 MI = getDefIgnoringCopies(Src, *MRI);
3596 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3597 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3598 // denormal mode, but we're implicitly canonicalizing in a source operand.
3599 const ConstantFP *LHS =
3600 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3601 if (LHS && LHS->isZero()) {
3602 Mods |= SISrcMods::NEG;
3603 Src = MI->getOperand(2).getReg();
3604 }
3605 }
3606
3607 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3608 Src = MI->getOperand(1).getReg();
3609 Mods |= SISrcMods::ABS;
3610 }
3611
3612 if (OpSel)
3613 Mods |= SISrcMods::OP_SEL_0;
3614
3615 return std::pair(Src, Mods);
3616}
3617
3618Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3619 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3620 bool ForceVGPR) const {
3621 if ((Mods != 0 || ForceVGPR) &&
3622 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3623
3624 // If we looked through copies to find source modifiers on an SGPR operand,
3625 // we now have an SGPR register source. To avoid potentially violating the
3626 // constant bus restriction, we need to insert a copy to a VGPR.
3627 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3628 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3629 TII.get(AMDGPU::COPY), VGPRSrc)
3630 .addReg(Src);
3631 Src = VGPRSrc;
3632 }
3633
3634 return Src;
3635}
3636
3637///
3638/// This will select either an SGPR or VGPR operand and will save us from
3639/// having to write an extra tablegen pattern.
3640InstructionSelector::ComplexRendererFns
3641AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3642 return {{
3643 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3644 }};
3645}
3646
3647InstructionSelector::ComplexRendererFns
3648AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3649 Register Src;
3650 unsigned Mods;
3651 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
3652
3653 return {{
3654 [=](MachineInstrBuilder &MIB) {
3655 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3656 },
3657 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3658 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3659 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3660 }};
3661}
3662
3663InstructionSelector::ComplexRendererFns
3664AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3665 Register Src;
3666 unsigned Mods;
3667 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
3668 /*IsCanonicalizing=*/true,
3669 /*AllowAbs=*/false);
3670
3671 return {{
3672 [=](MachineInstrBuilder &MIB) {
3673 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3674 },
3675 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3676 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3677 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3678 }};
3679}
3680
3681InstructionSelector::ComplexRendererFns
3682AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3683 return {{
3684 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3685 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3686 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3687 }};
3688}
3689
3690InstructionSelector::ComplexRendererFns
3691AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3692 Register Src;
3693 unsigned Mods;
3694 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
3695
3696 return {{
3697 [=](MachineInstrBuilder &MIB) {
3698 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3699 },
3700 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3701 }};
3702}
3703
3704InstructionSelector::ComplexRendererFns
3705AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3706 MachineOperand &Root) const {
3707 Register Src;
3708 unsigned Mods;
3709 std::tie(Src, Mods) =
3710 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
3711
3712 return {{
3713 [=](MachineInstrBuilder &MIB) {
3714 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3715 },
3716 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3717 }};
3718}
3719
3720InstructionSelector::ComplexRendererFns
3721AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3722 Register Src;
3723 unsigned Mods;
3724 std::tie(Src, Mods) =
3725 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
3726 /*AllowAbs=*/false);
3727
3728 return {{
3729 [=](MachineInstrBuilder &MIB) {
3730 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3731 },
3732 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3733 }};
3734}
3735
3736InstructionSelector::ComplexRendererFns
3737AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3738 Register Reg = Root.getReg();
3739 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3740 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3741 return {};
3742 return {{
3743 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3744 }};
3745}
3746
3747std::pair<Register, unsigned>
3748AMDGPUInstructionSelector::selectVOP3PModsImpl(
3749 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3750 unsigned Mods = 0;
3751 MachineInstr *MI = MRI.getVRegDef(Src);
3752
3753 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3754 // It's possible to see an f32 fneg here, but unlikely.
3755 // TODO: Treat f32 fneg as only high bit.
3756 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3757 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3758 Src = MI->getOperand(1).getReg();
3759 MI = MRI.getVRegDef(Src);
3760 }
3761
3762 // TODO: Handle G_FSUB 0 as fneg
3763
3764 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3765 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3766
3767 // Packed instructions do not have abs modifiers.
3768 Mods |= SISrcMods::OP_SEL_1;
3769
3770 return std::pair(Src, Mods);
3771}
3772
3773InstructionSelector::ComplexRendererFns
3774AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3775 MachineRegisterInfo &MRI
3776 = Root.getParent()->getParent()->getParent()->getRegInfo();
3777
3778 Register Src;
3779 unsigned Mods;
3780 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3781
3782 return {{
3783 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3784 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3785 }};
3786}
3787
3788InstructionSelector::ComplexRendererFns
3789AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3790 MachineRegisterInfo &MRI
3791 = Root.getParent()->getParent()->getParent()->getRegInfo();
3792
3793 Register Src;
3794 unsigned Mods;
3795 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3796
3797 return {{
3798 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3799 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3800 }};
3801}
3802
3803InstructionSelector::ComplexRendererFns
3804AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3805 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3806 // Value is in Imm operand as i1 sign extended to int64_t.
3807 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
3808 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3809 "expected i1 value");
3810 unsigned Mods = SISrcMods::OP_SEL_1;
3811 if (Root.getImm() == -1)
3812 Mods ^= SISrcMods::NEG;
3813 return {{
3814 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3815 }};
3816}
3817
3818InstructionSelector::ComplexRendererFns
3819AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3820 MachineOperand &Root) const {
3821 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3822 "expected i1 value");
3823 unsigned Mods = SISrcMods::OP_SEL_1;
3824 if (Root.getImm() != 0)
3825 Mods |= SISrcMods::OP_SEL_0;
3826
3827 return {{
3828 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3829 }};
3830}
3831
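// Build a REG_SEQUENCE from 2, 4 or 8 32-bit elements and return the wide
// result register; used by the WMMA modifier matchers below to rebuild an
// operand after per-element fneg/fabs have been stripped.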
3832static Register buildRegSequence(SmallVectorImpl&lt;Register&gt; &Elts,
3833 MachineInstr *InsertPt,
3834 MachineRegisterInfo &MRI) {
3835 const TargetRegisterClass *DstRegClass;
3836 switch (Elts.size()) {
3837 case 8:
3838 DstRegClass = &AMDGPU::VReg_256RegClass;
3839 break;
3840 case 4:
3841 DstRegClass = &AMDGPU::VReg_128RegClass;
3842 break;
3843 case 2:
3844 DstRegClass = &AMDGPU::VReg_64RegClass;
3845 break;
3846 default:
3847 llvm_unreachable("unhandled Reg sequence size");
3848 }
3849
3850 MachineIRBuilder B(*InsertPt);
3851 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3852 .addDef(MRI.createVirtualRegister(DstRegClass));
3853 for (unsigned i = 0; i < Elts.size(); ++i) {
3854 MIB.addReg(Elts[i]);
3855 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3856 }
3857 return MIB->getOperand(0).getReg();
3858}
3859
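// Given that every vector element carried the same fneg or fabs, fold it into
// the NEG / NEG_HI bits of the packed source modifiers and rebuild the operand
// from the stripped elements.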
3860static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3861 SmallVectorImpl&lt;Register&gt; &Elts, Register &Src,
3862 MachineInstr *InsertPt,
3863 MachineRegisterInfo &MRI) {
3864 if (ModOpcode == TargetOpcode::G_FNEG) {
3865 Mods |= SISrcMods::NEG;
3866 // Check if all elements also have abs modifier
3867 SmallVector<Register, 8> NegAbsElts;
3868 for (auto El : Elts) {
3869 Register FabsSrc;
3870 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3871 break;
3872 NegAbsElts.push_back(FabsSrc);
3873 }
3874 if (Elts.size() != NegAbsElts.size()) {
3875 // Neg
3876 Src = buildRegSequence(Elts, InsertPt, MRI);
3877 } else {
3878 // Neg and Abs
3879 Mods |= SISrcMods::NEG_HI;
3880 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3881 }
3882 } else {
3883 assert(ModOpcode == TargetOpcode::G_FABS);
3884 // Abs
3885 Mods |= SISrcMods::NEG_HI;
3886 Src = buildRegSequence(Elts, InsertPt, MRI);
3887 }
3888}
3889
3890InstructionSelector::ComplexRendererFns
3891AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3892 Register Src = Root.getReg();
3893 unsigned Mods = SISrcMods::OP_SEL_1;
3894 SmallVector&lt;Register, 8&gt; EltsF32;
3895
3896 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3897 assert(BV->getNumSources() > 0);
3898 // Based on first element decide which mod we match, neg or abs
3899 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3900 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3901 ? AMDGPU::G_FNEG
3902 : AMDGPU::G_FABS;
3903 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3904 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3905 if (ElF32->getOpcode() != ModOpcode)
3906 break;
3907 EltsF32.push_back(ElF32->getOperand(1).getReg());
3908 }
3909
3910 // All elements had ModOpcode modifier
3911 if (BV->getNumSources() == EltsF32.size()) {
3912 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3913 *MRI);
3914 }
3915 }
3916
3917 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3918 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3919}
3920
3921InstructionSelector::ComplexRendererFns
3922AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3923 Register Src = Root.getReg();
3924 unsigned Mods = SISrcMods::OP_SEL_1;
3925 SmallVector<Register, 8> EltsV2F16;
3926
3927 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3928 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
3929 Register FNegSrc;
3930 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
3931 break;
3932 EltsV2F16.push_back(FNegSrc);
3933 }
3934
3935 // All elements had ModOpcode modifier
3936 if (CV->getNumSources() == EltsV2F16.size()) {
3937 Mods |= SISrcMods::NEG;
3938 Mods |= SISrcMods::NEG_HI;
3939 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
3940 }
3941 }
3942
3943 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3944 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3945}
3946
3947InstructionSelector::ComplexRendererFns
3948AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
3949 Register Src = Root.getReg();
3950 unsigned Mods = SISrcMods::OP_SEL_1;
3951 SmallVector<Register, 8> EltsV2F16;
3952
3953 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3954 assert(CV->getNumSources() > 0);
3955 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
3956 // Based on first element decide which mod we match, neg or abs
3957 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
3958 ? AMDGPU::G_FNEG
3959 : AMDGPU::G_FABS;
3960
3961 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
3962 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
3963 if (ElV2F16->getOpcode() != ModOpcode)
3964 break;
3965 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
3966 }
3967
3968 // All elements had ModOpcode modifier
3969 if (CV->getNumSources() == EltsV2F16.size()) {
3970 MachineIRBuilder B(*Root.getParent());
3971 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
3972 *MRI);
3973 }
3974 }
3975
3976 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3977 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3978}
3979
3980InstructionSelector::ComplexRendererFns
3981AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
3982 std::optional<FPValueAndVReg> FPValReg;
3983 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
3984 if (TII.isInlineConstant(FPValReg->Value)) {
3985 return {{[=](MachineInstrBuilder &MIB) {
3986 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
3987 }}};
3988 }
3989 // Non-inlineable splat floats should not fall through to the integer
3990 // immediate checks.
3991 return {};
3992 }
3993
3994 APInt ICst;
3995 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
3996 if (TII.isInlineConstant(ICst)) {
3997 return {
3998 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
3999 }
4000 }
4001
4002 return {};
4003}
4004
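// For SWMMAC index operands, peel a constant logical shift off the 32-bit
// index source and encode it as the index_key operand: multiples of 8 bits
// for the 8-bit form below, a shift of exactly 16 for the 16-bit form.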
4005InstructionSelector::ComplexRendererFns
4006AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4007 Register Src =
4008 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4009 unsigned Key = 0;
4010
4011 Register ShiftSrc;
4012 std::optional<ValueAndVReg> ShiftAmt;
4013 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4014 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4015 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4016 Key = ShiftAmt->Value.getZExtValue() / 8;
4017 Src = ShiftSrc;
4018 }
4019
4020 return {{
4021 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4022 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4023 }};
4024}
4025
4026InstructionSelector::ComplexRendererFns
4027AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4028
4029 Register Src =
4030 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4031 unsigned Key = 0;
4032
4033 Register ShiftSrc;
4034 std::optional<ValueAndVReg> ShiftAmt;
4035 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4036 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4037 ShiftAmt->Value.getZExtValue() == 16) {
4038 Src = ShiftSrc;
4039 Key = 1;
4040 }
4041
4042 return {{
4043 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4044 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4045 }};
4046}
4047
4048InstructionSelector::ComplexRendererFns
4049AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4050 Register Src;
4051 unsigned Mods;
4052 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4053
4054 // FIXME: Handle op_sel
4055 return {{
4056 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4057 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4058 }};
4059}
4060
4061InstructionSelector::ComplexRendererFns
4062AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4063 Register Src;
4064 unsigned Mods;
4065 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4066 /*IsCanonicalizing=*/true,
4067 /*AllowAbs=*/false,
4068 /*OpSel=*/false);
4069
4070 return {{
4071 [=](MachineInstrBuilder &MIB) {
4072 MIB.addReg(
4073 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4074 },
4075 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4076 }};
4077}
4078
4079InstructionSelector::ComplexRendererFns
4080AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4081 Register Src;
4082 unsigned Mods;
4083 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4084 /*IsCanonicalizing=*/true,
4085 /*AllowAbs=*/false,
4086 /*OpSel=*/true);
4087
4088 return {{
4089 [=](MachineInstrBuilder &MIB) {
4090 MIB.addReg(
4091 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4092 },
4093 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4094 }};
4095}
4096
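// Split an SMRD address into a base plus an encoded immediate offset, a
// 32-bit SGPR offset, or both, depending on which out-parameters the caller
// passes.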
4097bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4098 Register &Base,
4099 Register *SOffset,
4100 int64_t *Offset) const {
4101 MachineInstr *MI = Root.getParent();
4102 MachineBasicBlock *MBB = MI->getParent();
4103
4104 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4105 // then we can select all ptr + 32-bit offsets.
4106 SmallVector<GEPInfo, 4> AddrInfo;
4107 getAddrModeInfo(*MI, *MRI, AddrInfo);
4108
4109 if (AddrInfo.empty())
4110 return false;
4111
4112 const GEPInfo &GEPI = AddrInfo[0];
4113 std::optional<int64_t> EncodedImm;
4114
4115 if (SOffset && Offset) {
4116 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4117 /*HasSOffset=*/true);
4118 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4119 AddrInfo.size() > 1) {
4120 const GEPInfo &GEPI2 = AddrInfo[1];
4121 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4122 if (Register OffsetReg =
4123 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4124 Base = GEPI2.SgprParts[0];
4125 *SOffset = OffsetReg;
4126 *Offset = *EncodedImm;
4127 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4128 return true;
4129
4130 // For unbuffered smem loads, it is illegal for the Immediate Offset
4131 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4132 // is negative. Handle the case where the Immediate Offset + SOffset
4133 // is negative.
4134 auto SKnown = KB->getKnownBits(*SOffset);
4135 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4136 return false;
4137
4138 return true;
4139 }
4140 }
4141 }
4142 return false;
4143 }
4144
4145 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4146 /*HasSOffset=*/false);
4147 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4148 Base = GEPI.SgprParts[0];
4149 *Offset = *EncodedImm;
4150 return true;
4151 }
4152
4153 // SGPR offset is unsigned.
4154 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4155 GEPI.Imm != 0) {
4156 // If we make it this far we have a load with a 32-bit immediate offset.
4157 // It is OK to select this using an SGPR offset, because we have already
4158 // failed trying to select this load into one of the _IMM variants since
4159 // the _IMM patterns are considered before the _SGPR patterns.
4160 Base = GEPI.SgprParts[0];
4161 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4162 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4163 .addImm(GEPI.Imm);
4164 return true;
4165 }
4166
4167 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4168 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4169 Base = GEPI.SgprParts[0];
4170 *SOffset = OffsetReg;
4171 return true;
4172 }
4173 }
4174
4175 return false;
4176}
4177
4178InstructionSelector::ComplexRendererFns
4179AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4180 Register Base;
4181 int64_t Offset;
4182 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4183 return std::nullopt;
4184
4185 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4186 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4187}
4188
4189InstructionSelector::ComplexRendererFns
4190AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4191 SmallVector<GEPInfo, 4> AddrInfo;
4192 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4193
4194 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4195 return std::nullopt;
4196
4197 const GEPInfo &GEPInfo = AddrInfo[0];
4198 Register PtrReg = GEPInfo.SgprParts[0];
4199 std::optional<int64_t> EncodedImm =
4200 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4201 if (!EncodedImm)
4202 return std::nullopt;
4203
4204 return {{
4205 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4206 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4207 }};
4208}
4209
4210InstructionSelector::ComplexRendererFns
4211AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4212 Register Base, SOffset;
4213 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4214 return std::nullopt;
4215
4216 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4217 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4218}
4219
4220InstructionSelector::ComplexRendererFns
4221AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4222 Register Base, SOffset;
4223 int64_t Offset;
4224 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4225 return std::nullopt;
4226
4227 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4228 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4229 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4230}
4231
4232std::pair<Register, int>
4233AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4234 uint64_t FlatVariant) const {
4235 MachineInstr *MI = Root.getParent();
4236
4237 auto Default = std::pair(Root.getReg(), 0);
4238
4239 if (!STI.hasFlatInstOffsets())
4240 return Default;
4241
4242 Register PtrBase;
4243 int64_t ConstOffset;
4244 std::tie(PtrBase, ConstOffset) =
4245 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4246
4247 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4248 !isFlatScratchBaseLegal(Root.getReg())))
4249 return Default;
4250
4251 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4252 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4253 return Default;
4254
4255 return std::pair(PtrBase, ConstOffset);
4256}
4257
4258InstructionSelector::ComplexRendererFns
4259AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4260 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4261
4262 return {{
4263 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4264 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4265 }};
4266}
4267
4268InstructionSelector::ComplexRendererFns
4269AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4270 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4271
4272 return {{
4273 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4274 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4275 }};
4276}
4277
4278InstructionSelector::ComplexRendererFns
4279AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4280 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4281
4282 return {{
4283 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4284 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4285 }};
4286}
4287
4288// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4289InstructionSelector::ComplexRendererFns
4290AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4291 Register Addr = Root.getReg();
4292 Register PtrBase;
4293 int64_t ConstOffset;
4294 int64_t ImmOffset = 0;
4295
4296 // Match the immediate offset first, which canonically is moved as low as
4297 // possible.
4298 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4299
4300 if (ConstOffset != 0) {
4301 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4302 SIInstrFlags::FlatGlobal)) {
4303 Addr = PtrBase;
4304 ImmOffset = ConstOffset;
4305 } else {
4306 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4307 if (isSGPR(PtrBaseDef->Reg)) {
4308 if (ConstOffset > 0) {
4309 // Offset is too large.
4310 //
4311 // saddr + large_offset -> saddr +
4312 // (voffset = large_offset & ~MaxOffset) +
4313 // (large_offset & MaxOffset);
4314 int64_t SplitImmOffset, RemainderOffset;
4315 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4316 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4317
4318 if (isUInt<32>(RemainderOffset)) {
4319 MachineInstr *MI = Root.getParent();
4320 MachineBasicBlock *MBB = MI->getParent();
4321 Register HighBits =
4322 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4323
4324 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4325 HighBits)
4326 .addImm(RemainderOffset);
4327
4328 return {{
4329 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4330 [=](MachineInstrBuilder &MIB) {
4331 MIB.addReg(HighBits);
4332 }, // voffset
4333 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4334 }};
4335 }
4336 }
4337
4338 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4339 // is 1 we would need to perform 1 or 2 extra moves for each half of
4340 // the constant, and it is better to do a scalar add and then issue a
4341 // single VALU instruction to materialize zero. Otherwise it takes fewer
4342 // instructions to perform VALU adds with immediates or inline literals.
4343 unsigned NumLiterals =
4344 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4345 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4346 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4347 return std::nullopt;
4348 }
4349 }
4350 }
4351
4352 // Match the variable offset.
4353 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4354 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4355 // Look through the SGPR->VGPR copy.
4356 Register SAddr =
4357 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4358
4359 if (isSGPR(SAddr)) {
4360 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4361
4362 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4363 // inserted later.
4364 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4365 return {{[=](MachineInstrBuilder &MIB) { // saddr
4366 MIB.addReg(SAddr);
4367 },
4368 [=](MachineInstrBuilder &MIB) { // voffset
4369 MIB.addReg(VOffset);
4370 },
4371 [=](MachineInstrBuilder &MIB) { // offset
4372 MIB.addImm(ImmOffset);
4373 }}};
4374 }
4375 }
4376 }
4377
4378 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4379 // drop this.
4380 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4381 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4382 return std::nullopt;
4383
4384 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4385 // moves required to copy a 64-bit SGPR to VGPR.
4386 MachineInstr *MI = Root.getParent();
4387 MachineBasicBlock *MBB = MI->getParent();
4388 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4389
4390 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4391 .addImm(0);
4392
4393 return {{
4394 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4395 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4396 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4397 }};
4398}
4399
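// Match the saddr-only flat scratch addressing mode: a frame index or SGPR
// base plus a legal immediate offset. A frame-index + uniform-offset sum is
// folded into saddr with an S_ADD_I32.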
4400InstructionSelector::ComplexRendererFns
4401AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4402 Register Addr = Root.getReg();
4403 Register PtrBase;
4404 int64_t ConstOffset;
4405 int64_t ImmOffset = 0;
4406
4407 // Match the immediate offset first, which canonically is moved as low as
4408 // possible.
4409 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4410
4411 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4412 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4413 SIInstrFlags::FlatScratch)) {
4414 Addr = PtrBase;
4415 ImmOffset = ConstOffset;
4416 }
4417
4418 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4419 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4420 int FI = AddrDef->MI->getOperand(1).getIndex();
4421 return {{
4422 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4423 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4424 }};
4425 }
4426
4427 Register SAddr = AddrDef->Reg;
4428
4429 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4430 Register LHS = AddrDef->MI->getOperand(1).getReg();
4431 Register RHS = AddrDef->MI->getOperand(2).getReg();
4432 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4433 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4434
4435 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4436 isSGPR(RHSDef->Reg)) {
4437 int FI = LHSDef->MI->getOperand(1).getIndex();
4438 MachineInstr &I = *Root.getParent();
4439 MachineBasicBlock *BB = I.getParent();
4440 const DebugLoc &DL = I.getDebugLoc();
4441 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4442
4443 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4444 .addFrameIndex(FI)
4445 .addReg(RHSDef->Reg)
4446 .setOperandDead(3); // Dead scc
4447 }
4448 }
4449
4450 if (!isSGPR(SAddr))
4451 return std::nullopt;
4452
4453 return {{
4454 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4455 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4456 }};
4457}
4458
4459// Check whether the flat scratch SVS swizzle bug affects this access.
4460bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4461 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4462 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4463 return false;
4464
4465 // The bug affects the swizzling of SVS accesses if there is any carry out
4466 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4467 // voffset to (soffset + inst_offset).
4468 auto VKnown = KB->getKnownBits(VAddr);
4469 auto SKnown = KnownBits::computeForAddSub(
4470 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4471 KnownBits::makeConstant(APInt(32, ImmOffset)));
4472 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4473 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4474 return (VMax & 3) + (SMax & 3) >= 4;
4475}
4476
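// Match the SVS flat scratch addressing mode: a VGPR offset (vaddr) plus an
// SGPR or frame-index base (saddr) plus an immediate offset, rejecting
// combinations affected by the SVS swizzle bug checked above.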
4477InstructionSelector::ComplexRendererFns
4478AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4479 Register Addr = Root.getReg();
4480 Register PtrBase;
4481 int64_t ConstOffset;
4482 int64_t ImmOffset = 0;
4483
4484 // Match the immediate offset first, which canonically is moved as low as
4485 // possible.
4486 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4487
4488 Register OrigAddr = Addr;
4489 if (ConstOffset != 0 &&
4490 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4491 Addr = PtrBase;
4492 ImmOffset = ConstOffset;
4493 }
4494
4495 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4496 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4497 return std::nullopt;
4498
4499 Register RHS = AddrDef->MI->getOperand(2).getReg();
4500 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4501 return std::nullopt;
4502
4503 Register LHS = AddrDef->MI->getOperand(1).getReg();
4504 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4505
4506 if (OrigAddr != Addr) {
4507 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4508 return std::nullopt;
4509 } else {
4510 if (!isFlatScratchBaseLegalSV(OrigAddr))
4511 return std::nullopt;
4512 }
4513
4514 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4515 return std::nullopt;
4516
4517 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4518 int FI = LHSDef->MI->getOperand(1).getIndex();
4519 return {{
4520 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4521 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4522 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4523 }};
4524 }
4525
4526 if (!isSGPR(LHS))
4527 return std::nullopt;
4528
4529 return {{
4530 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4531 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4532 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4533 }};
4534}
4535
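// Select the MUBUF offen scratch addressing mode: for a plain constant
// address, materialize the bits above the maximum immediate offset into a
// VGPR; otherwise fold a frame index or pointer base into vaddr and keep a
// legal immediate offset.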
4536InstructionSelector::ComplexRendererFns
4537AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4538 MachineInstr *MI = Root.getParent();
4539 MachineBasicBlock *MBB = MI->getParent();
4540 MachineFunction *MF = MBB->getParent();
4541 const SIMachineFunctionInfo *Info = MF->getInfo&lt;SIMachineFunctionInfo&gt;();
4542
4543 int64_t Offset = 0;
4544 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4545 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4546 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4547
4548 // TODO: Should this be inside the render function? The iterator seems to
4549 // move.
4550 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4551 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4552 HighBits)
4553 .addImm(Offset & ~MaxOffset);
4554
4555 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4556 MIB.addReg(Info->getScratchRSrcReg());
4557 },
4558 [=](MachineInstrBuilder &MIB) { // vaddr
4559 MIB.addReg(HighBits);
4560 },
4561 [=](MachineInstrBuilder &MIB) { // soffset
4562 // Use constant zero for soffset and rely on eliminateFrameIndex
4563 // to choose the appropriate frame register if need be.
4564 MIB.addImm(0);
4565 },
4566 [=](MachineInstrBuilder &MIB) { // offset
4567 MIB.addImm(Offset & MaxOffset);
4568 }}};
4569 }
4570
4571 assert(Offset == 0 || Offset == -1);
4572
4573 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4574 // offsets.
4575 std::optional<int> FI;
4576 Register VAddr = Root.getReg();
4577 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4578 Register PtrBase;
4579 int64_t ConstOffset;
4580 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4581 if (ConstOffset != 0) {
4582 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4583 (!STI.privateMemoryResourceIsRangeChecked() ||
4584 KB->signBitIsZero(PtrBase))) {
4585 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4586 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4587 FI = PtrBaseDef->getOperand(1).getIndex();
4588 else
4589 VAddr = PtrBase;
4590 Offset = ConstOffset;
4591 }
4592 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4593 FI = RootDef->getOperand(1).getIndex();
4594 }
4595 }
4596
4597 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4598 MIB.addReg(Info->getScratchRSrcReg());
4599 },
4600 [=](MachineInstrBuilder &MIB) { // vaddr
4601 if (FI)
4602 MIB.addFrameIndex(*FI);
4603 else
4604 MIB.addReg(VAddr);
4605 },
4606 [=](MachineInstrBuilder &MIB) { // soffset
4607 // Use constant zero for soffset and rely on eliminateFrameIndex
4608 // to choose the appropriate frame register if need be.
4609 MIB.addImm(0);
4610 },
4611 [=](MachineInstrBuilder &MIB) { // offset
4612 MIB.addImm(Offset);
4613 }}};
4614}
4615
4616bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4617 int64_t Offset) const {
4618 if (!isUInt<16>(Offset))
4619 return false;
4620
4621 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4622 return true;
4623
4624 // On Southern Islands, instructions with a negative base value and an
4625 // offset don't seem to work.
4626 return KB->signBitIsZero(Base);
4627}
4628
4629bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4630 int64_t Offset1,
4631 unsigned Size) const {
4632 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4633 return false;
4634 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4635 return false;
4636
4637 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4638 return true;
4639
4640 // On Southern Islands, instructions with a negative base value and an
4641 // offset don't seem to work.
4642 return KB->signBitIsZero(Base);
4643}
4644
4645// Return whether the operation has NoUnsignedWrap property.
4646static bool isNoUnsignedWrap(MachineInstr *Addr) {
4647 return Addr->getOpcode() == TargetOpcode::G_OR ||
4648 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4649 Addr->getFlag(MachineInstr::NoUWrap));
4650}
4651
4652// Check that the base address of a flat scratch load/store in the form of `base +
4653// offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per hardware
4654// requirement). We always treat the first operand as the base address here.
4655bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4656 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4657
4658 if (isNoUnsignedWrap(AddrMI))
4659 return true;
4660
4661 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4662 // values.
4663 if (STI.hasSignedScratchOffsets())