1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
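// Annotation (illustrative sketch, not part of the original source): copying a
// non-VCC s1 value into a VCC-bank register masks the untrusted high bits
// before comparing, e.g. for a 32-bit VGPR source:
//   %masked = V_AND_B32_e32 1, %src
//   %dst = V_CMP_NE_U32_e64 0, %masked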
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
246 return false;
247
248 Register DstReg = I.getOperand(0).getReg();
249 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
250
251 I.eraseFromParent();
252 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
253}
254
255bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
256 const DebugLoc &DL = I.getDebugLoc();
257 MachineBasicBlock *BB = I.getParent();
258
259 Register DstReg = I.getOperand(0).getReg();
260 Register SrcReg = I.getOperand(1).getReg();
261 std::optional<ValueAndVReg> Arg =
262 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
263
264 if (Arg) {
265 const int64_t Value = Arg->Value.getZExtValue();
266 if (Value == 0) {
267 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
268 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
269 } else {
270 assert(Value == 1);
271 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
272 }
273 I.eraseFromParent();
274 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
275 }
276
277 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
278 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
279
280 unsigned SelectOpcode =
281 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
282 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
283 .addReg(TRI.getExec())
284 .addImm(0);
285
286 I.eraseFromParent();
287 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
288}
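// Annotation (illustrative sketch, not part of the original source): for a
// non-constant SCC-bank source on wave64 the code above emits:
//   $scc = COPY %src
//   %dst = S_CSELECT_B64 $exec, 0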
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302}
303
304bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
305 const Register DefReg = I.getOperand(0).getReg();
306 const LLT DefTy = MRI->getType(DefReg);
307
308 // S1 G_PHIs should not be selected in instruction-select, instead:
309 // - divergent S1 G_PHI should go through lane mask merging algorithm
310 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
311 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
312 if (DefTy == LLT::scalar(1))
313 return false;
314
315 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
316
317 const RegClassOrRegBank &RegClassOrBank =
318 MRI->getRegClassOrRegBank(DefReg);
319
320 const TargetRegisterClass *DefRC =
321 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
322 if (!DefRC) {
323 if (!DefTy.isValid()) {
324 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
325 return false;
326 }
327
328 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
329 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
330 if (!DefRC) {
331 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
332 return false;
333 }
334 }
335
336 // If inputs have register bank, assign corresponding reg class.
337 // Note: registers don't need to have the same reg bank.
338 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
339 const Register SrcReg = I.getOperand(i).getReg();
340
341 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
342 if (RB) {
343 const LLT SrcTy = MRI->getType(SrcReg);
344 const TargetRegisterClass *SrcRC =
345 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
346 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
347 return false;
348 }
349 }
350
351 I.setDesc(TII.get(TargetOpcode::PHI));
352 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
353}
354
356AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
357 const TargetRegisterClass &SubRC,
358 unsigned SubIdx) const {
359
360 MachineInstr *MI = MO.getParent();
361 MachineBasicBlock *BB = MO.getParent()->getParent();
362 Register DstReg = MRI->createVirtualRegister(&SubRC);
363
364 if (MO.isReg()) {
365 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
366 Register Reg = MO.getReg();
367 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
368 .addReg(Reg, 0, ComposedSubIdx);
369
370 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
371 MO.isKill(), MO.isDead(), MO.isUndef(),
372 MO.isEarlyClobber(), 0, MO.isDebug(),
373 MO.isInternalRead());
374 }
375
376 assert(MO.isImm());
377
378 APInt Imm(64, MO.getImm());
379
380 switch (SubIdx) {
381 default:
382 llvm_unreachable("do not know to split immediate with this sub index.");
383 case AMDGPU::sub0:
384 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
385 case AMDGPU::sub1:
386 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
387 }
388}
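// Annotation (not part of the original source): for an immediate operand the
// 64-bit value is split per 32-bit half, e.g. 0x1122334455667788 yields
// 0x55667788 for sub0 and 0x11223344 for sub1.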
389
390static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
391 switch (Opc) {
392 case AMDGPU::G_AND:
393 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
394 case AMDGPU::G_OR:
395 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
396 case AMDGPU::G_XOR:
397 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
398 default:
399 llvm_unreachable("not a bit op");
400 }
401}
402
403bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
404 Register DstReg = I.getOperand(0).getReg();
405 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
406
407 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
408 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
409 DstRB->getID() != AMDGPU::VCCRegBankID)
410 return false;
411
412 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
413 STI.isWave64());
414 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
415
416 // Dead implicit-def of scc
417 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
418 true, // isImp
419 false, // isKill
420 true)); // isDead
421 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422}
423
424bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
425 MachineBasicBlock *BB = I.getParent();
426 MachineFunction *MF = BB->getParent();
427 Register DstReg = I.getOperand(0).getReg();
428 const DebugLoc &DL = I.getDebugLoc();
429 LLT Ty = MRI->getType(DstReg);
430 if (Ty.isVector())
431 return false;
432
433 unsigned Size = Ty.getSizeInBits();
434 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
435 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
436 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
437
438 if (Size == 32) {
439 if (IsSALU) {
440 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
441 MachineInstr *Add =
442 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
443 .add(I.getOperand(1))
444 .add(I.getOperand(2))
445 .setOperandDead(3); // Dead scc
446 I.eraseFromParent();
447 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
448 }
449
450 if (STI.hasAddNoCarry()) {
451 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
452 I.setDesc(TII.get(Opc));
453 I.addOperand(*MF, MachineOperand::CreateImm(0));
454 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
456 }
457
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
459
460 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
461 MachineInstr *Add
462 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
463 .addDef(UnusedCarry, RegState::Dead)
464 .add(I.getOperand(1))
465 .add(I.getOperand(2))
466 .addImm(0);
467 I.eraseFromParent();
468 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
469 }
470
471 assert(!Sub && "illegal sub should not reach here");
472
473 const TargetRegisterClass &RC
474 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
475 const TargetRegisterClass &HalfRC
476 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
477
478 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
479 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
480 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
481 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
482
483 Register DstLo = MRI->createVirtualRegister(&HalfRC);
484 Register DstHi = MRI->createVirtualRegister(&HalfRC);
485
486 if (IsSALU) {
487 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
488 .add(Lo1)
489 .add(Lo2);
490 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
491 .add(Hi1)
492 .add(Hi2)
493 .setOperandDead(3); // Dead scc
494 } else {
495 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
496 Register CarryReg = MRI->createVirtualRegister(CarryRC);
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
498 .addDef(CarryReg)
499 .add(Lo1)
500 .add(Lo2)
501 .addImm(0);
502 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
503 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
504 .add(Hi1)
505 .add(Hi2)
506 .addReg(CarryReg, RegState::Kill)
507 .addImm(0);
508
509 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
510 return false;
511 }
512
513 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
514 .addReg(DstLo)
515 .addImm(AMDGPU::sub0)
516 .addReg(DstHi)
517 .addImm(AMDGPU::sub1);
518
519
520 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
521 return false;
522
523 I.eraseFromParent();
524 return true;
525}
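// Annotation (illustrative sketch, not part of the original source): a 64-bit
// VALU G_ADD is split into a carry-producing low add and a carry-consuming
// high add, then recombined:
//   %lo, %carry = V_ADD_CO_U32_e64 %a.sub0, %b.sub0, 0
//   %hi, %dead = V_ADDC_U32_e64 %a.sub1, %b.sub1, %carry, 0
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1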
526
527bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
528 MachineInstr &I) const {
529 MachineBasicBlock *BB = I.getParent();
530 MachineFunction *MF = BB->getParent();
531 const DebugLoc &DL = I.getDebugLoc();
532 Register Dst0Reg = I.getOperand(0).getReg();
533 Register Dst1Reg = I.getOperand(1).getReg();
534 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
535 I.getOpcode() == AMDGPU::G_UADDE;
536 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
537 I.getOpcode() == AMDGPU::G_USUBE;
538
539 if (isVCC(Dst1Reg, *MRI)) {
540 unsigned NoCarryOpc =
541 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
542 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
543 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
544 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
545 I.addOperand(*MF, MachineOperand::CreateImm(0));
546 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
547 }
548
549 Register Src0Reg = I.getOperand(2).getReg();
550 Register Src1Reg = I.getOperand(3).getReg();
551
552 if (HasCarryIn) {
553 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
554 .addReg(I.getOperand(4).getReg());
555 }
556
557 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
558 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
559
560 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
561 .add(I.getOperand(2))
562 .add(I.getOperand(3));
563
564 if (MRI->use_nodbg_empty(Dst1Reg)) {
565 CarryInst.setOperandDead(3); // Dead scc
566 } else {
567 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
568 .addReg(AMDGPU::SCC);
569 if (!MRI->getRegClassOrNull(Dst1Reg))
570 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
571 }
572
573 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
574 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
575 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
576 return false;
577
578 if (HasCarryIn &&
579 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
580 AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 I.eraseFromParent();
584 return true;
585}
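// Annotation (illustrative sketch, not part of the original source): on the
// SALU path the carry is modeled through SCC, e.g. G_UADDE becomes:
//   $scc = COPY %carry_in
//   %dst0 = S_ADDC_U32 %src0, %src1
//   %carry_out = COPY $scc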
586
587bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
588 MachineInstr &I) const {
589 MachineBasicBlock *BB = I.getParent();
590 MachineFunction *MF = BB->getParent();
591 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
592 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
593 MRI->use_nodbg_empty(I.getOperand(1).getReg());
594
595 unsigned Opc;
596 if (Subtarget->hasMADIntraFwdBug())
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
598 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
599 else if (UseNoCarry)
600 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
601 : AMDGPU::V_MAD_NC_I64_I32_e64;
602 else
603 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
604
605 if (UseNoCarry)
606 I.removeOperand(1);
607
608 I.setDesc(TII.get(Opc));
609 I.addOperand(*MF, MachineOperand::CreateImm(0));
610 I.addImplicitDefUseOperands(*MF);
611 I.getOperand(0).setIsEarlyClobber(true);
612 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
613}
614
615// TODO: We should probably legalize these to only using 32-bit results.
616bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
617 MachineBasicBlock *BB = I.getParent();
618 Register DstReg = I.getOperand(0).getReg();
619 Register SrcReg = I.getOperand(1).getReg();
620 LLT DstTy = MRI->getType(DstReg);
621 LLT SrcTy = MRI->getType(SrcReg);
622 const unsigned SrcSize = SrcTy.getSizeInBits();
623 unsigned DstSize = DstTy.getSizeInBits();
624
625 // TODO: Should handle any multiple of 32 offset.
626 unsigned Offset = I.getOperand(2).getImm();
627 if (Offset % 32 != 0 || DstSize > 128)
628 return false;
629
630 // 16-bit operations really use 32-bit registers.
631 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
632 if (DstSize == 16)
633 DstSize = 32;
634
635 const TargetRegisterClass *DstRC =
636 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
637 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
638 return false;
639
640 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
641 const TargetRegisterClass *SrcRC =
642 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
643 if (!SrcRC)
644 return false;
645 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
646 DstSize / 32);
647 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
648 if (!SrcRC)
649 return false;
650
651 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
652 *SrcRC, I.getOperand(1));
653 const DebugLoc &DL = I.getDebugLoc();
654 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
655 .addReg(SrcReg, 0, SubReg);
656
657 I.eraseFromParent();
658 return true;
659}
660
661bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
662 MachineBasicBlock *BB = MI.getParent();
663 Register DstReg = MI.getOperand(0).getReg();
664 LLT DstTy = MRI->getType(DstReg);
665 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
666
667 const unsigned SrcSize = SrcTy.getSizeInBits();
668 if (SrcSize < 32)
669 return selectImpl(MI, *CoverageInfo);
670
671 const DebugLoc &DL = MI.getDebugLoc();
672 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
673 const unsigned DstSize = DstTy.getSizeInBits();
674 const TargetRegisterClass *DstRC =
675 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
676 if (!DstRC)
677 return false;
678
679 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
680 MachineInstrBuilder MIB =
681 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
682 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
683 MachineOperand &Src = MI.getOperand(I + 1);
684 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
685 MIB.addImm(SubRegs[I]);
686
687 const TargetRegisterClass *SrcRC
688 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
689 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
690 return false;
691 }
692
693 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
694 return false;
695
696 MI.eraseFromParent();
697 return true;
698}
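// Annotation (illustrative sketch, not part of the original source): merging
// two s32 values into an s64 becomes a single REG_SEQUENCE:
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1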
699
700bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
701 MachineBasicBlock *BB = MI.getParent();
702 const int NumDst = MI.getNumOperands() - 1;
703
704 MachineOperand &Src = MI.getOperand(NumDst);
705
706 Register SrcReg = Src.getReg();
707 Register DstReg0 = MI.getOperand(0).getReg();
708 LLT DstTy = MRI->getType(DstReg0);
709 LLT SrcTy = MRI->getType(SrcReg);
710
711 const unsigned DstSize = DstTy.getSizeInBits();
712 const unsigned SrcSize = SrcTy.getSizeInBits();
713 const DebugLoc &DL = MI.getDebugLoc();
714 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
715
716 const TargetRegisterClass *SrcRC =
717 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
718 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
719 return false;
720
721 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
722 // source, and this relies on the fact that the same subregister indices are
723 // used for both.
724 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
725 for (int I = 0, E = NumDst; I != E; ++I) {
726 MachineOperand &Dst = MI.getOperand(I);
727 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
728 .addReg(SrcReg, 0, SubRegs[I]);
729
730 // Make sure the subregister index is valid for the source register.
731 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
732 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
733 return false;
734
735 const TargetRegisterClass *DstRC =
736 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
737 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
738 return false;
739 }
740
741 MI.eraseFromParent();
742 return true;
743}
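// Annotation (illustrative sketch, not part of the original source): the
// inverse operation copies each piece out through a subregister index, e.g.
// for an s64 source:
//   %dst0 = COPY %src.sub0
//   %dst1 = COPY %src.sub1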
744
745bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
746 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
747 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
748
749 Register Src0 = MI.getOperand(1).getReg();
750 Register Src1 = MI.getOperand(2).getReg();
751 LLT SrcTy = MRI->getType(Src0);
752 const unsigned SrcSize = SrcTy.getSizeInBits();
753
754 // G_BUILD_VECTOR with sources of 32 bits or wider is handled like G_MERGE_VALUES.
755 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
756 return selectG_MERGE_VALUES(MI);
757 }
758
759 // Selection logic below is for V2S16 only.
760 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
761 Register Dst = MI.getOperand(0).getReg();
762 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
763 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
764 SrcTy != LLT::scalar(32)))
765 return selectImpl(MI, *CoverageInfo);
766
767 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
768 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
769 return false;
770
771 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
772 DstBank->getID() == AMDGPU::VGPRRegBankID);
773 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
774
775 const DebugLoc &DL = MI.getDebugLoc();
776 MachineBasicBlock *BB = MI.getParent();
777
778 // First, before trying TableGen patterns, check if both sources are
779 // constants. In those cases, we can trivially compute the final constant
780 // and emit a simple move.
781 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
782 if (ConstSrc1) {
783 auto ConstSrc0 =
784 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
785 if (ConstSrc0) {
786 const int64_t K0 = ConstSrc0->Value.getSExtValue();
787 const int64_t K1 = ConstSrc1->Value.getSExtValue();
788 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
789 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
790 uint32_t Imm = Lo16 | (Hi16 << 16);
791
792 // VALU
793 if (IsVector) {
794 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
795 MI.eraseFromParent();
796 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
797 }
798
799 // SALU
800 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
803 }
804 }
805
806 // Now try TableGen patterns.
807 if (selectImpl(MI, *CoverageInfo))
808 return true;
809
810 // TODO: This should probably be a combine somewhere
811 // (build_vector $src0, undef) -> copy $src0
812 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
813 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
814 MI.setDesc(TII.get(AMDGPU::COPY));
815 MI.removeOperand(2);
816 const auto &RC =
817 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
818 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
819 RBI.constrainGenericRegister(Src0, RC, *MRI);
820 }
821
822 // TODO: Can be improved?
823 if (IsVector) {
824 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
825 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
826 .addImm(0xFFFF)
827 .addReg(Src0);
828 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
829 return false;
830
831 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
832 .addReg(Src1)
833 .addImm(16)
834 .addReg(TmpReg);
835 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
836 return false;
837
838 MI.eraseFromParent();
839 return true;
840 }
841
842 Register ShiftSrc0;
843 Register ShiftSrc1;
844
845 // With multiple uses of the shift, this will duplicate the shift and
846 // increase register pressure.
847 //
848 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
849 // => (S_PACK_HH_B32_B16 $src0, $src1)
850 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
851 // => (S_PACK_HL_B32_B16 $src0, $src1)
852 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
853 // => (S_PACK_LH_B32_B16 $src0, $src1)
854 // (build_vector $src0, $src1)
855 // => (S_PACK_LL_B32_B16 $src0, $src1)
856
857 bool Shift0 = mi_match(
858 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
859
860 bool Shift1 = mi_match(
861 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
862
863 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
864 if (Shift0 && Shift1) {
865 Opc = AMDGPU::S_PACK_HH_B32_B16;
866 MI.getOperand(1).setReg(ShiftSrc0);
867 MI.getOperand(2).setReg(ShiftSrc1);
868 } else if (Shift1) {
869 Opc = AMDGPU::S_PACK_LH_B32_B16;
870 MI.getOperand(2).setReg(ShiftSrc1);
871 } else if (Shift0) {
872 auto ConstSrc1 =
873 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
874 if (ConstSrc1 && ConstSrc1->Value == 0) {
875 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
876 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
877 .addReg(ShiftSrc0)
878 .addImm(16)
879 .setOperandDead(3); // Dead scc
880
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
883 }
884 if (STI.hasSPackHL()) {
885 Opc = AMDGPU::S_PACK_HL_B32_B16;
886 MI.getOperand(1).setReg(ShiftSrc0);
887 }
888 }
889
890 MI.setDesc(TII.get(Opc));
891 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
892}
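// Annotation (illustrative sketch, not part of the original source): an
// example of the pack selection above on the SALU path:
//   (build_vector_trunc (lshr $a, 16), (lshr $b, 16))
//     -> %dst = S_PACK_HH_B32_B16 $a, $b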
893
894bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
895 const MachineOperand &MO = I.getOperand(0);
896
897 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
898 // regbank check here is to know why getConstrainedRegClassForOperand failed.
899 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
900 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
901 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
902 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
903 return true;
904 }
905
906 return false;
907}
908
909bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
910 MachineBasicBlock *BB = I.getParent();
911
912 Register DstReg = I.getOperand(0).getReg();
913 Register Src0Reg = I.getOperand(1).getReg();
914 Register Src1Reg = I.getOperand(2).getReg();
915 LLT Src1Ty = MRI->getType(Src1Reg);
916
917 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
918 unsigned InsSize = Src1Ty.getSizeInBits();
919
920 int64_t Offset = I.getOperand(3).getImm();
921
922 // FIXME: These cases should have been illegal and unnecessary to check here.
923 if (Offset % 32 != 0 || InsSize % 32 != 0)
924 return false;
925
926 // Currently not handled by getSubRegFromChannel.
927 if (InsSize > 128)
928 return false;
929
930 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
931 if (SubReg == AMDGPU::NoSubRegister)
932 return false;
933
934 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
935 const TargetRegisterClass *DstRC =
936 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
937 if (!DstRC)
938 return false;
939
940 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
941 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
942 const TargetRegisterClass *Src0RC =
943 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
944 const TargetRegisterClass *Src1RC =
945 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
946
947 // Deal with weird cases where the class only partially supports the subreg
948 // index.
949 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
950 if (!Src0RC || !Src1RC)
951 return false;
952
953 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
954 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
955 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
956 return false;
957
958 const DebugLoc &DL = I.getDebugLoc();
959 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
960 .addReg(Src0Reg)
961 .addReg(Src1Reg)
962 .addImm(SubReg);
963
964 I.eraseFromParent();
965 return true;
966}
967
968bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
969 Register DstReg = MI.getOperand(0).getReg();
970 Register SrcReg = MI.getOperand(1).getReg();
971 Register OffsetReg = MI.getOperand(2).getReg();
972 Register WidthReg = MI.getOperand(3).getReg();
973
974 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
975 "scalar BFX instructions are expanded in regbankselect");
976 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
977 "64-bit vector BFX instructions are expanded in regbankselect");
978
979 const DebugLoc &DL = MI.getDebugLoc();
980 MachineBasicBlock *MBB = MI.getParent();
981
982 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
983 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
985 .addReg(SrcReg)
986 .addReg(OffsetReg)
987 .addReg(WidthReg);
988 MI.eraseFromParent();
989 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
990}
991
992bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
993 if (STI.getLDSBankCount() != 16)
994 return selectImpl(MI, *CoverageInfo);
995
996 Register Dst = MI.getOperand(0).getReg();
997 Register Src0 = MI.getOperand(2).getReg();
998 Register M0Val = MI.getOperand(6).getReg();
999 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1000 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1001 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1002 return false;
1003
1004 // This requires 2 instructions. It is possible to write a pattern to support
1005 // this, but the generated isel emitter doesn't correctly deal with multiple
1006 // output instructions using the same physical register input. The copy to m0
1007 // is incorrectly placed before the second instruction.
1008 //
1009 // TODO: Match source modifiers.
1010
1011 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 const DebugLoc &DL = MI.getDebugLoc();
1013 MachineBasicBlock *MBB = MI.getParent();
1014
1015 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1016 .addReg(M0Val);
1017 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1018 .addImm(2)
1019 .addImm(MI.getOperand(4).getImm()) // $attr
1020 .addImm(MI.getOperand(3).getImm()); // $attrchan
1021
1022 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1023 .addImm(0) // $src0_modifiers
1024 .addReg(Src0) // $src0
1025 .addImm(MI.getOperand(4).getImm()) // $attr
1026 .addImm(MI.getOperand(3).getImm()) // $attrchan
1027 .addImm(0) // $src2_modifiers
1028 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1029 .addImm(MI.getOperand(5).getImm()) // $high
1030 .addImm(0) // $clamp
1031 .addImm(0); // $omod
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037// Writelane is special in that it can use SGPR and M0 (which would normally
1038// count as using the constant bus twice - but in this case it is allowed since
1039// the lane selector doesn't count as a use of the constant bus). However, it is
1040// still required to abide by the 1 SGPR rule. Fix this up if we might have
1041// multiple SGPRs.
1042bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1043 // With a constant bus limit of at least 2, there's no issue.
1044 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1045 return selectImpl(MI, *CoverageInfo);
1046
1047 MachineBasicBlock *MBB = MI.getParent();
1048 const DebugLoc &DL = MI.getDebugLoc();
1049 Register VDst = MI.getOperand(0).getReg();
1050 Register Val = MI.getOperand(2).getReg();
1051 Register LaneSelect = MI.getOperand(3).getReg();
1052 Register VDstIn = MI.getOperand(4).getReg();
1053
1054 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1055
1056 std::optional<ValueAndVReg> ConstSelect =
1057 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1058 if (ConstSelect) {
1059 // The selector has to be an inline immediate, so we can use whatever for
1060 // the other operands.
1061 MIB.addReg(Val);
1062 MIB.addImm(ConstSelect->Value.getSExtValue() &
1063 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1064 } else {
1065 std::optional<ValueAndVReg> ConstVal =
1066 getIConstantVRegValWithLookThrough(Val, *MRI);
1067
1068 // If the value written is an inline immediate, we can get away without a
1069 // copy to m0.
1070 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1071 STI.hasInv2PiInlineImm())) {
1072 MIB.addImm(ConstVal->Value.getSExtValue());
1073 MIB.addReg(LaneSelect);
1074 } else {
1075 MIB.addReg(Val);
1076
1077 // If the lane selector was originally in a VGPR and copied with
1078 // readfirstlane, there's a hazard to read the same SGPR from the
1079 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1080 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1081
1082 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1083 .addReg(LaneSelect);
1084 MIB.addReg(AMDGPU::M0);
1085 }
1086 }
1087
1088 MIB.addReg(VDstIn);
1089
1090 MI.eraseFromParent();
1091 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1092}
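// Annotation (illustrative sketch, not part of the original source): with a
// constant bus limit of 1, a non-immediate lane select is routed through m0:
//   $m0 = COPY %lane_sgpr
//   %vdst = V_WRITELANE_B32 %val, $m0, %vdst_in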
1093
1094// We need to handle this here because tablegen doesn't support matching
1095// instructions with multiple outputs.
1096bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1097 Register Dst0 = MI.getOperand(0).getReg();
1098 Register Dst1 = MI.getOperand(1).getReg();
1099
1100 LLT Ty = MRI->getType(Dst0);
1101 unsigned Opc;
1102 if (Ty == LLT::scalar(32))
1103 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1104 else if (Ty == LLT::scalar(64))
1105 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1106 else
1107 return false;
1108
1109 // TODO: Match source modifiers.
1110
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 MachineBasicBlock *MBB = MI.getParent();
1113
1114 Register Numer = MI.getOperand(3).getReg();
1115 Register Denom = MI.getOperand(4).getReg();
1116 unsigned ChooseDenom = MI.getOperand(5).getImm();
1117
1118 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1119
1120 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1121 .addDef(Dst1)
1122 .addImm(0) // $src0_modifiers
1123 .addUse(Src0) // $src0
1124 .addImm(0) // $src1_modifiers
1125 .addUse(Denom) // $src1
1126 .addImm(0) // $src2_modifiers
1127 .addUse(Numer) // $src2
1128 .addImm(0) // $clamp
1129 .addImm(0); // $omod
1130
1131 MI.eraseFromParent();
1132 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1133}
1134
1135bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1136 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1137 switch (IntrinsicID) {
1138 case Intrinsic::amdgcn_if_break: {
1139 MachineBasicBlock *BB = I.getParent();
1140
1141 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1142 // SelectionDAG uses for wave32 vs wave64.
1143 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1144 .add(I.getOperand(0))
1145 .add(I.getOperand(2))
1146 .add(I.getOperand(3));
1147
1148 Register DstReg = I.getOperand(0).getReg();
1149 Register Src0Reg = I.getOperand(2).getReg();
1150 Register Src1Reg = I.getOperand(3).getReg();
1151
1152 I.eraseFromParent();
1153
1154 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1155 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1156
1157 return true;
1158 }
1159 case Intrinsic::amdgcn_interp_p1_f16:
1160 return selectInterpP1F16(I);
1161 case Intrinsic::amdgcn_wqm:
1162 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1163 case Intrinsic::amdgcn_softwqm:
1164 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1165 case Intrinsic::amdgcn_strict_wwm:
1166 case Intrinsic::amdgcn_wwm:
1167 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1168 case Intrinsic::amdgcn_strict_wqm:
1169 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1170 case Intrinsic::amdgcn_writelane:
1171 return selectWritelane(I);
1172 case Intrinsic::amdgcn_div_scale:
1173 return selectDivScale(I);
1174 case Intrinsic::amdgcn_icmp:
1175 case Intrinsic::amdgcn_fcmp:
1176 if (selectImpl(I, *CoverageInfo))
1177 return true;
1178 return selectIntrinsicCmp(I);
1179 case Intrinsic::amdgcn_ballot:
1180 return selectBallot(I);
1181 case Intrinsic::amdgcn_reloc_constant:
1182 return selectRelocConstant(I);
1183 case Intrinsic::amdgcn_groupstaticsize:
1184 return selectGroupStaticSize(I);
1185 case Intrinsic::returnaddress:
1186 return selectReturnAddress(I);
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1191 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1192 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1200 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1202 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1205 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1206 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1207 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1208 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1209 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1212 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1215 return selectSMFMACIntrin(I);
1216 case Intrinsic::amdgcn_permlane16_swap:
1217 case Intrinsic::amdgcn_permlane32_swap:
1218 return selectPermlaneSwapIntrin(I, IntrinsicID);
1219 case Intrinsic::amdgcn_wave_shuffle:
1220 return selectWaveShuffleIntrin(I);
1221 default:
1222 return selectImpl(I, *CoverageInfo);
1223 }
1224}
1225
1226static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1227 const GCNSubtarget &ST) {
1228 if (Size != 16 && Size != 32 && Size != 64)
1229 return -1;
1230
1231 if (Size == 16 && !ST.has16BitInsts())
1232 return -1;
1233
1234 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1235 unsigned FakeS16Opc, unsigned S32Opc,
1236 unsigned S64Opc) {
1237 if (Size == 16)
1238 return ST.hasTrue16BitInsts()
1239 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1240 : S16Opc;
1241 if (Size == 32)
1242 return S32Opc;
1243 return S64Opc;
1244 };
1245
1246 switch (P) {
1247 default:
1248 llvm_unreachable("Unknown condition code!");
1249 case CmpInst::ICMP_NE:
1250 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1251 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1252 AMDGPU::V_CMP_NE_U64_e64);
1253 case CmpInst::ICMP_EQ:
1254 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1255 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1256 AMDGPU::V_CMP_EQ_U64_e64);
1257 case CmpInst::ICMP_SGT:
1258 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1259 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1260 AMDGPU::V_CMP_GT_I64_e64);
1261 case CmpInst::ICMP_SGE:
1262 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1263 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1264 AMDGPU::V_CMP_GE_I64_e64);
1265 case CmpInst::ICMP_SLT:
1266 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1267 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1268 AMDGPU::V_CMP_LT_I64_e64);
1269 case CmpInst::ICMP_SLE:
1270 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1271 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1272 AMDGPU::V_CMP_LE_I64_e64);
1273 case CmpInst::ICMP_UGT:
1274 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1275 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1276 AMDGPU::V_CMP_GT_U64_e64);
1277 case CmpInst::ICMP_UGE:
1278 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1279 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1280 AMDGPU::V_CMP_GE_U64_e64);
1281 case CmpInst::ICMP_ULT:
1282 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1283 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1284 AMDGPU::V_CMP_LT_U64_e64);
1285 case CmpInst::ICMP_ULE:
1286 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1287 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1288 AMDGPU::V_CMP_LE_U64_e64);
1289
1290 case CmpInst::FCMP_OEQ:
1291 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1292 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1293 AMDGPU::V_CMP_EQ_F64_e64);
1294 case CmpInst::FCMP_OGT:
1295 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1296 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1297 AMDGPU::V_CMP_GT_F64_e64);
1298 case CmpInst::FCMP_OGE:
1299 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1300 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1301 AMDGPU::V_CMP_GE_F64_e64);
1302 case CmpInst::FCMP_OLT:
1303 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1304 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1305 AMDGPU::V_CMP_LT_F64_e64);
1306 case CmpInst::FCMP_OLE:
1307 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1308 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1309 AMDGPU::V_CMP_LE_F64_e64);
1310 case CmpInst::FCMP_ONE:
1311 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1312 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1313 AMDGPU::V_CMP_NEQ_F64_e64);
1314 case CmpInst::FCMP_ORD:
1315 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1316 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1317 AMDGPU::V_CMP_O_F64_e64);
1318 case CmpInst::FCMP_UNO:
1319 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1320 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1321 AMDGPU::V_CMP_U_F64_e64);
1322 case CmpInst::FCMP_UEQ:
1323 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1324 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1325 AMDGPU::V_CMP_NLG_F64_e64);
1326 case CmpInst::FCMP_UGT:
1327 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1328 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1329 AMDGPU::V_CMP_NLE_F64_e64);
1330 case CmpInst::FCMP_UGE:
1331 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1332 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1333 AMDGPU::V_CMP_NLT_F64_e64);
1334 case CmpInst::FCMP_ULT:
1335 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1336 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1337 AMDGPU::V_CMP_NGE_F64_e64);
1338 case CmpInst::FCMP_ULE:
1339 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1340 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1341 AMDGPU::V_CMP_NGT_F64_e64);
1342 case CmpInst::FCMP_UNE:
1343 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1344 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1345 AMDGPU::V_CMP_NEQ_F64_e64);
1346 case CmpInst::FCMP_TRUE:
1347 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1348 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1349 AMDGPU::V_CMP_TRU_F64_e64);
1350 case CmpInst::FCMP_FALSE:
1351 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1352 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1353 AMDGPU::V_CMP_F_F64_e64);
1354 }
1355}
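// Annotation (not part of the original source): e.g. ICMP_EQ maps to
// V_CMP_EQ_U32_e64 for 32-bit operands, V_CMP_EQ_U64_e64 for 64-bit operands,
// and V_CMP_EQ_U16_t16_e64 on subtargets with real true16 instructions.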
1356
1357int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1358 unsigned Size) const {
1359 if (Size == 64) {
1360 if (!STI.hasScalarCompareEq64())
1361 return -1;
1362
1363 switch (P) {
1364 case CmpInst::ICMP_NE:
1365 return AMDGPU::S_CMP_LG_U64;
1366 case CmpInst::ICMP_EQ:
1367 return AMDGPU::S_CMP_EQ_U64;
1368 default:
1369 return -1;
1370 }
1371 }
1372
1373 if (Size == 32) {
1374 switch (P) {
1375 case CmpInst::ICMP_NE:
1376 return AMDGPU::S_CMP_LG_U32;
1377 case CmpInst::ICMP_EQ:
1378 return AMDGPU::S_CMP_EQ_U32;
1379 case CmpInst::ICMP_SGT:
1380 return AMDGPU::S_CMP_GT_I32;
1381 case CmpInst::ICMP_SGE:
1382 return AMDGPU::S_CMP_GE_I32;
1383 case CmpInst::ICMP_SLT:
1384 return AMDGPU::S_CMP_LT_I32;
1385 case CmpInst::ICMP_SLE:
1386 return AMDGPU::S_CMP_LE_I32;
1387 case CmpInst::ICMP_UGT:
1388 return AMDGPU::S_CMP_GT_U32;
1389 case CmpInst::ICMP_UGE:
1390 return AMDGPU::S_CMP_GE_U32;
1391 case CmpInst::ICMP_ULT:
1392 return AMDGPU::S_CMP_LT_U32;
1393 case CmpInst::ICMP_ULE:
1394 return AMDGPU::S_CMP_LE_U32;
1395 case CmpInst::FCMP_OEQ:
1396 return AMDGPU::S_CMP_EQ_F32;
1397 case CmpInst::FCMP_OGT:
1398 return AMDGPU::S_CMP_GT_F32;
1399 case CmpInst::FCMP_OGE:
1400 return AMDGPU::S_CMP_GE_F32;
1401 case CmpInst::FCMP_OLT:
1402 return AMDGPU::S_CMP_LT_F32;
1403 case CmpInst::FCMP_OLE:
1404 return AMDGPU::S_CMP_LE_F32;
1405 case CmpInst::FCMP_ONE:
1406 return AMDGPU::S_CMP_LG_F32;
1407 case CmpInst::FCMP_ORD:
1408 return AMDGPU::S_CMP_O_F32;
1409 case CmpInst::FCMP_UNO:
1410 return AMDGPU::S_CMP_U_F32;
1411 case CmpInst::FCMP_UEQ:
1412 return AMDGPU::S_CMP_NLG_F32;
1413 case CmpInst::FCMP_UGT:
1414 return AMDGPU::S_CMP_NLE_F32;
1415 case CmpInst::FCMP_UGE:
1416 return AMDGPU::S_CMP_NLT_F32;
1417 case CmpInst::FCMP_ULT:
1418 return AMDGPU::S_CMP_NGE_F32;
1419 case CmpInst::FCMP_ULE:
1420 return AMDGPU::S_CMP_NGT_F32;
1421 case CmpInst::FCMP_UNE:
1422 return AMDGPU::S_CMP_NEQ_F32;
1423 default:
1424 llvm_unreachable("Unknown condition code!");
1425 }
1426 }
1427
1428 if (Size == 16) {
1429 if (!STI.hasSALUFloatInsts())
1430 return -1;
1431
1432 switch (P) {
1433 case CmpInst::FCMP_OEQ:
1434 return AMDGPU::S_CMP_EQ_F16;
1435 case CmpInst::FCMP_OGT:
1436 return AMDGPU::S_CMP_GT_F16;
1437 case CmpInst::FCMP_OGE:
1438 return AMDGPU::S_CMP_GE_F16;
1439 case CmpInst::FCMP_OLT:
1440 return AMDGPU::S_CMP_LT_F16;
1441 case CmpInst::FCMP_OLE:
1442 return AMDGPU::S_CMP_LE_F16;
1443 case CmpInst::FCMP_ONE:
1444 return AMDGPU::S_CMP_LG_F16;
1445 case CmpInst::FCMP_ORD:
1446 return AMDGPU::S_CMP_O_F16;
1447 case CmpInst::FCMP_UNO:
1448 return AMDGPU::S_CMP_U_F16;
1449 case CmpInst::FCMP_UEQ:
1450 return AMDGPU::S_CMP_NLG_F16;
1451 case CmpInst::FCMP_UGT:
1452 return AMDGPU::S_CMP_NLE_F16;
1453 case CmpInst::FCMP_UGE:
1454 return AMDGPU::S_CMP_NLT_F16;
1455 case CmpInst::FCMP_ULT:
1456 return AMDGPU::S_CMP_NGE_F16;
1457 case CmpInst::FCMP_ULE:
1458 return AMDGPU::S_CMP_NGT_F16;
1459 case CmpInst::FCMP_UNE:
1460 return AMDGPU::S_CMP_NEQ_F16;
1461 default:
1462 llvm_unreachable("Unknown condition code!");
1463 }
1464 }
1465
1466 return -1;
1467}
1468
1469bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1470
1471 MachineBasicBlock *BB = I.getParent();
1472 const DebugLoc &DL = I.getDebugLoc();
1473
1474 Register SrcReg = I.getOperand(2).getReg();
1475 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1476
1477 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1478
1479 Register CCReg = I.getOperand(0).getReg();
1480 if (!isVCC(CCReg, *MRI)) {
1481 int Opcode = getS_CMPOpcode(Pred, Size);
1482 if (Opcode == -1)
1483 return false;
1484 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1485 .add(I.getOperand(2))
1486 .add(I.getOperand(3));
1487 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1488 .addReg(AMDGPU::SCC);
1489 bool Ret =
1490 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1491 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1492 I.eraseFromParent();
1493 return Ret;
1494 }
1495
1496 if (I.getOpcode() == AMDGPU::G_FCMP)
1497 return false;
1498
1499 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1500 if (Opcode == -1)
1501 return false;
1502
1503 MachineInstrBuilder ICmp;
1504 // t16 instructions
1505 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1506 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1507 .addImm(0)
1508 .add(I.getOperand(2))
1509 .addImm(0)
1510 .add(I.getOperand(3))
1511 .addImm(0); // op_sel
1512 } else {
1513 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1514 .add(I.getOperand(2))
1515 .add(I.getOperand(3));
1516 }
1517
1518 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1519 *TRI.getBoolRC(), *MRI);
1520 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1521 I.eraseFromParent();
1522 return Ret;
1523}
1524
1525bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1526 Register Dst = I.getOperand(0).getReg();
1527 if (isVCC(Dst, *MRI))
1528 return false;
1529
1530 LLT DstTy = MRI->getType(Dst);
1531 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1532 return false;
1533
1534 MachineBasicBlock *BB = I.getParent();
1535 const DebugLoc &DL = I.getDebugLoc();
1536 Register SrcReg = I.getOperand(2).getReg();
1537 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1538
1539 // i1 inputs are not supported in GlobalISel.
1540 if (Size == 1)
1541 return false;
1542
1543 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1544 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1545 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1546 I.eraseFromParent();
1547 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1548 }
1549
1550 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1551 if (Opcode == -1)
1552 return false;
1553
1554 MachineInstrBuilder SelectedMI;
1555 MachineOperand &LHS = I.getOperand(2);
1556 MachineOperand &RHS = I.getOperand(3);
1557 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1558 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1559 Register Src0Reg =
1560 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1561 Register Src1Reg =
1562 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1563 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1564 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1565 SelectedMI.addImm(Src0Mods);
1566 SelectedMI.addReg(Src0Reg);
1567 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1568 SelectedMI.addImm(Src1Mods);
1569 SelectedMI.addReg(Src1Reg);
1570 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1571 SelectedMI.addImm(0); // clamp
1572 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1573 SelectedMI.addImm(0); // op_sel
1574
1575 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1576 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1577 return false;
1578
1579 I.eraseFromParent();
1580 return true;
1581}
1582
1583// Ballot has to zero out bits in the input lane mask that are zero in the
1584// current exec; this is done as an AND with exec. For inputs produced by an
1585// instruction that implicitly uses the same exec (for example a compare in the
1586// same basic block, or an SCC to VCC copy), a plain copy is used instead.
1587static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1588 MachineBasicBlock *MBB) {
1589 MachineInstr *MI = MRI.getVRegDef(Reg);
1590 if (MI->getParent() != MBB)
1591 return false;
1592
1593 // Lane mask generated by SCC to VCC copy.
1594 if (MI->getOpcode() == AMDGPU::COPY) {
1595 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1596 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1597 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1598 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1599 return true;
1600 }
1601
1602 // Lane mask generated using compare with same exec.
1603 if (isa<GAnyCmp>(MI))
1604 return true;
1605
1606 Register LHS, RHS;
1607 // Look through AND.
1608 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1609 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1610         isLaneMaskFromSameBlock(RHS, MRI, MBB);
1611
1612 return false;
1613}
1614
1615bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1616 MachineBasicBlock *BB = I.getParent();
1617 const DebugLoc &DL = I.getDebugLoc();
1618 Register DstReg = I.getOperand(0).getReg();
1619 Register SrcReg = I.getOperand(2).getReg();
1620 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1621 const unsigned WaveSize = STI.getWavefrontSize();
1622
1623 // In the common case, the return type matches the wave size.
1624 // However we also support emitting i64 ballots in wave32 mode.
1625 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1626 return false;
1627
1628 std::optional<ValueAndVReg> Arg =
1629      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1630
1631 Register Dst = DstReg;
1632  // i64 ballot on wave32: compute the ballot into a new wave-size (i32) Dst first.
1633 if (BallotSize != WaveSize) {
1634 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1635 }
1636
1637 if (Arg) {
1638 const int64_t Value = Arg->Value.getZExtValue();
1639 if (Value == 0) {
1640 // Dst = S_MOV 0
1641 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1642 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1643 } else {
1644 // Dst = COPY EXEC
1645 assert(Value == 1);
1646 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1647 }
1648 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1649 return false;
1650 } else {
1651 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1652 // Dst = COPY SrcReg
1653 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1654 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1655 return false;
1656 } else {
1657 // Dst = S_AND SrcReg, EXEC
1658 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1659 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1660 .addReg(SrcReg)
1661 .addReg(TRI.getExec())
1662 .setOperandDead(3); // Dead scc
1663 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1664 return false;
1665 }
1666 }
1667
1668 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1669 if (BallotSize != WaveSize) {
1670 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1671 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1672 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1673 .addReg(Dst)
1674 .addImm(AMDGPU::sub0)
1675 .addReg(HiReg)
1676 .addImm(AMDGPU::sub1);
1677 }
1678
1679 I.eraseFromParent();
1680 return true;
1681}
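// To summarize the cases above: a constant-false ballot becomes S_MOV of 0, a
// constant-true ballot becomes a copy of EXEC, an input that is already a lane
// mask produced under the same exec in this block becomes a plain copy, and
// anything else is ANDed with EXEC. A 64-bit ballot on wave32 is then
// zero-extended by pairing the 32-bit result with a zeroed high half.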
1682
1683bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1684 Register DstReg = I.getOperand(0).getReg();
1685 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1686 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1687 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1688 return false;
1689
1690 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1691
1692 Module *M = MF->getFunction().getParent();
1693 const MDNode *Metadata = I.getOperand(2).getMetadata();
1694 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1695 auto *RelocSymbol = cast<GlobalVariable>(
1696 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1697
1698 MachineBasicBlock *BB = I.getParent();
1699 BuildMI(*BB, &I, I.getDebugLoc(),
1700 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1701      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1702
1703 I.eraseFromParent();
1704 return true;
1705}
1706
1707bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1708 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1709
1710 Register DstReg = I.getOperand(0).getReg();
1711 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1712 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1713 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1714
1715 MachineBasicBlock *MBB = I.getParent();
1716 const DebugLoc &DL = I.getDebugLoc();
1717
1718 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1719
1720 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1721 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1722 MIB.addImm(MFI->getLDSSize());
1723 } else {
1724 Module *M = MF->getFunction().getParent();
1725 const GlobalValue *GV =
1726 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1727    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1728  }
1729
1730 I.eraseFromParent();
1731 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1732}
1733
1734bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1735 MachineBasicBlock *MBB = I.getParent();
1736 MachineFunction &MF = *MBB->getParent();
1737 const DebugLoc &DL = I.getDebugLoc();
1738
1739 MachineOperand &Dst = I.getOperand(0);
1740 Register DstReg = Dst.getReg();
1741 unsigned Depth = I.getOperand(2).getImm();
1742
1743 const TargetRegisterClass *RC
1744 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1745 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1746 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1747 return false;
1748
1749 // Check for kernel and shader functions
1750 if (Depth != 0 ||
1751 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1752 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1753 .addImm(0);
1754 I.eraseFromParent();
1755 return true;
1756 }
1757
1758 MachineFrameInfo &MFI = MF.getFrameInfo();
1759 // There is a call to @llvm.returnaddress in this function
1760 MFI.setReturnAddressIsTaken(true);
1761
1762 // Get the return address reg and mark it as an implicit live-in
1763 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1764 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1765 AMDGPU::SReg_64RegClass, DL);
1766 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1767 .addReg(LiveIn);
1768 I.eraseFromParent();
1769 return true;
1770}
1771
1772bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1773 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1774 // SelectionDAG uses for wave32 vs wave64.
1775 MachineBasicBlock *BB = MI.getParent();
1776 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1777 .add(MI.getOperand(1));
1778
1779 Register Reg = MI.getOperand(1).getReg();
1780 MI.eraseFromParent();
1781
1782 if (!MRI->getRegClassOrNull(Reg))
1783 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1784 return true;
1785}
1786
1787bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1788 MachineInstr &MI, Intrinsic::ID IntrID) const {
1789 MachineBasicBlock *MBB = MI.getParent();
1790 MachineFunction *MF = MBB->getParent();
1791 const DebugLoc &DL = MI.getDebugLoc();
1792
1793 unsigned IndexOperand = MI.getOperand(7).getImm();
1794 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1795 bool WaveDone = MI.getOperand(9).getImm() != 0;
1796
1797 if (WaveDone && !WaveRelease) {
1798 // TODO: Move this to IR verifier
1799 const Function &Fn = MF->getFunction();
1800 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1801 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1802 }
1803
1804 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1805 IndexOperand &= ~0x3f;
1806 unsigned CountDw = 0;
1807
1808 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1809 CountDw = (IndexOperand >> 24) & 0xf;
1810 IndexOperand &= ~(0xf << 24);
1811
1812 if (CountDw < 1 || CountDw > 4) {
1813 const Function &Fn = MF->getFunction();
1814 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1815 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1816 CountDw = 1;
1817 }
1818 }
1819
1820 if (IndexOperand) {
1821 const Function &Fn = MF->getFunction();
1822 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1823 Fn, "ds_ordered_count: bad index operand", DL));
1824 }
1825
1826 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1827 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1828
1829 unsigned Offset0 = OrderedCountIndex << 2;
1830 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1831
1832 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1833 Offset1 |= (CountDw - 1) << 6;
1834
1835 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1836 Offset1 |= ShaderType << 2;
1837
1838 unsigned Offset = Offset0 | (Offset1 << 8);
1839
1840 Register M0Val = MI.getOperand(2).getReg();
1841 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1842 .addReg(M0Val);
1843
1844 Register DstReg = MI.getOperand(0).getReg();
1845 Register ValReg = MI.getOperand(3).getReg();
1846 MachineInstrBuilder DS =
1847 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1848 .addReg(ValReg)
1849 .addImm(Offset)
1850 .cloneMemRefs(MI);
1851
1852 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1853 return false;
1854
1855 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1856 MI.eraseFromParent();
1857 return Ret;
1858}
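// The combined DS_ORDERED_COUNT offset built above packs:
//   bits [7:2]   ordered count index
//   bit  8       wave_release, bit 9 wave_done
//   bits [11:10] shader type (pre-GFX11 only)
//   bit  12      0 for ds_ordered_add, 1 for ds_ordered_swap
//   bits [14:13] dword count - 1 (GFX10 and later)
// For illustration (assuming shader type 0): ds_ordered_add with index 1,
// wave_release = 1 and wave_done = 0 gives Offset0 = 0x4, Offset1 = 0x1, i.e.
// a combined offset of 0x104.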
1859
1860static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1861 switch (IntrID) {
1862 case Intrinsic::amdgcn_ds_gws_init:
1863 return AMDGPU::DS_GWS_INIT;
1864 case Intrinsic::amdgcn_ds_gws_barrier:
1865 return AMDGPU::DS_GWS_BARRIER;
1866 case Intrinsic::amdgcn_ds_gws_sema_v:
1867 return AMDGPU::DS_GWS_SEMA_V;
1868 case Intrinsic::amdgcn_ds_gws_sema_br:
1869 return AMDGPU::DS_GWS_SEMA_BR;
1870 case Intrinsic::amdgcn_ds_gws_sema_p:
1871 return AMDGPU::DS_GWS_SEMA_P;
1872 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1873 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1874 default:
1875 llvm_unreachable("not a gws intrinsic");
1876 }
1877}
1878
1879bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1880 Intrinsic::ID IID) const {
1881 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1882 !STI.hasGWSSemaReleaseAll()))
1883 return false;
1884
1885 // intrinsic ID, vsrc, offset
1886 const bool HasVSrc = MI.getNumOperands() == 3;
1887 assert(HasVSrc || MI.getNumOperands() == 2);
1888
1889 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1890 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1891 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1892 return false;
1893
1894 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1895 unsigned ImmOffset;
1896
1897 MachineBasicBlock *MBB = MI.getParent();
1898 const DebugLoc &DL = MI.getDebugLoc();
1899
1900 MachineInstr *Readfirstlane = nullptr;
1901
1902 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1903 // incoming offset, in case there's an add of a constant. We'll have to put it
1904 // back later.
1905 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1906 Readfirstlane = OffsetDef;
1907 BaseOffset = OffsetDef->getOperand(1).getReg();
1908 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1909 }
1910
1911 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1912 // If we have a constant offset, try to use the 0 in m0 as the base.
1913 // TODO: Look into changing the default m0 initialization value. If the
1914 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1915 // the immediate offset.
1916
1917 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1918 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1919 .addImm(0);
1920 } else {
1921 std::tie(BaseOffset, ImmOffset) =
1922 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1923
1924 if (Readfirstlane) {
1925 // We have the constant offset now, so put the readfirstlane back on the
1926 // variable component.
1927 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1928 return false;
1929
1930 Readfirstlane->getOperand(1).setReg(BaseOffset);
1931 BaseOffset = Readfirstlane->getOperand(0).getReg();
1932 } else {
1933 if (!RBI.constrainGenericRegister(BaseOffset,
1934 AMDGPU::SReg_32RegClass, *MRI))
1935 return false;
1936 }
1937
1938 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1939 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1940 .addReg(BaseOffset)
1941 .addImm(16)
1942 .setOperandDead(3); // Dead scc
1943
1944 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1945 .addReg(M0Base);
1946 }
1947
1948 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1949 // offset field) % 64. Some versions of the programming guide omit the m0
1950 // part, or claim it's from offset 0.
1951
1952 unsigned Opc = gwsIntrinToOpcode(IID);
1953 const MCInstrDesc &InstrDesc = TII.get(Opc);
1954
1955 if (HasVSrc) {
1956 Register VSrc = MI.getOperand(1).getReg();
1957
1958 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1959 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1960 const TargetRegisterClass *SubRC =
1961 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1962
1963 if (!SubRC) {
1964 // 32-bit normal case.
1965 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1966 return false;
1967
1968 BuildMI(*MBB, &MI, DL, InstrDesc)
1969 .addReg(VSrc)
1970 .addImm(ImmOffset)
1971 .cloneMemRefs(MI);
1972 } else {
1973 // Requires even register alignment, so create 64-bit value and pad the
1974 // top half with undef.
1975 Register DataReg = MRI->createVirtualRegister(DataRC);
1976 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1977 return false;
1978
1979 Register UndefReg = MRI->createVirtualRegister(SubRC);
1980 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1981 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1982 .addReg(VSrc)
1983 .addImm(AMDGPU::sub0)
1984 .addReg(UndefReg)
1985 .addImm(AMDGPU::sub1);
1986
1987 BuildMI(*MBB, &MI, DL, InstrDesc)
1988 .addReg(DataReg)
1989 .addImm(ImmOffset)
1990 .cloneMemRefs(MI);
1991 }
1992 } else {
1993 BuildMI(*MBB, &MI, DL, InstrDesc)
1994 .addImm(ImmOffset)
1995 .cloneMemRefs(MI);
1996 }
1997
1998 MI.eraseFromParent();
1999 return true;
2000}
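// The variable part of the GWS resource id is taken from M0[21:16], which is
// why the non-constant base offset is shifted left by 16 before being copied
// into m0. A known-constant offset instead goes entirely into the
// instruction's immediate offset field, with m0 simply set to 0.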
2001
2002bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2003 bool IsAppend) const {
2004 Register PtrBase = MI.getOperand(2).getReg();
2005 LLT PtrTy = MRI->getType(PtrBase);
2006 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2007
2008 unsigned Offset;
2009 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2010
2011 // TODO: Should this try to look through readfirstlane like GWS?
2012 if (!isDSOffsetLegal(PtrBase, Offset)) {
2013 PtrBase = MI.getOperand(2).getReg();
2014 Offset = 0;
2015 }
2016
2017 MachineBasicBlock *MBB = MI.getParent();
2018 const DebugLoc &DL = MI.getDebugLoc();
2019 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2020
2021 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2022 .addReg(PtrBase);
2023 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2024 return false;
2025
2026 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2027 .addImm(Offset)
2028 .addImm(IsGDS ? -1 : 0)
2029 .cloneMemRefs(MI);
2030 MI.eraseFromParent();
2031 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2032}
2033
2034bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2035 MachineFunction *MF = MI.getMF();
2036 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2037
2038 MFInfo->setInitWholeWave();
2039 return selectImpl(MI, *CoverageInfo);
2040}
2041
2042static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2043 bool &IsTexFail) {
2044 if (TexFailCtrl)
2045 IsTexFail = true;
2046
2047 TFE = TexFailCtrl & 0x1;
2048 TexFailCtrl &= ~(uint64_t)0x1;
2049 LWE = TexFailCtrl & 0x2;
2050 TexFailCtrl &= ~(uint64_t)0x2;
2051
2052 return TexFailCtrl == 0;
2053}
2054
2055bool AMDGPUInstructionSelector::selectImageIntrinsic(
2056 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2057 MachineBasicBlock *MBB = MI.getParent();
2058 const DebugLoc &DL = MI.getDebugLoc();
2059 unsigned IntrOpcode = Intr->BaseOpcode;
2060
2061 // For image atomic: use no-return opcode if result is unused.
2062 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2063 Register ResultDef = MI.getOperand(0).getReg();
2064 if (MRI->use_nodbg_empty(ResultDef))
2065 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2066 }
2067
2068 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2069      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2070
2071 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2072 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2073 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2074 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2075
2076 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2077
2078 Register VDataIn = AMDGPU::NoRegister;
2079 Register VDataOut = AMDGPU::NoRegister;
2080 LLT VDataTy;
2081 int NumVDataDwords = -1;
2082 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2083 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2084
2085 bool Unorm;
2086 if (!BaseOpcode->Sampler)
2087 Unorm = true;
2088 else
2089 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2090
2091 bool TFE;
2092 bool LWE;
2093 bool IsTexFail = false;
2094 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2095 TFE, LWE, IsTexFail))
2096 return false;
2097
2098 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2099 const bool IsA16 = (Flags & 1) != 0;
2100 const bool IsG16 = (Flags & 2) != 0;
2101
2102 // A16 implies 16 bit gradients if subtarget doesn't support G16
2103 if (IsA16 && !STI.hasG16() && !IsG16)
2104 return false;
2105
2106 unsigned DMask = 0;
2107 unsigned DMaskLanes = 0;
2108
2109 if (BaseOpcode->Atomic) {
2110 if (!BaseOpcode->NoReturn)
2111 VDataOut = MI.getOperand(0).getReg();
2112 VDataIn = MI.getOperand(2).getReg();
2113 LLT Ty = MRI->getType(VDataIn);
2114
2115 // Be careful to allow atomic swap on 16-bit element vectors.
2116 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2117 Ty.getSizeInBits() == 128 :
2118 Ty.getSizeInBits() == 64;
2119
2120 if (BaseOpcode->AtomicX2) {
2121 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2122
2123 DMask = Is64Bit ? 0xf : 0x3;
2124 NumVDataDwords = Is64Bit ? 4 : 2;
2125 } else {
2126 DMask = Is64Bit ? 0x3 : 0x1;
2127 NumVDataDwords = Is64Bit ? 2 : 1;
2128 }
2129 } else {
2130 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2131 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2132
2133 if (BaseOpcode->Store) {
2134 VDataIn = MI.getOperand(1).getReg();
2135 VDataTy = MRI->getType(VDataIn);
2136 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2137 } else if (BaseOpcode->NoReturn) {
2138 NumVDataDwords = 0;
2139 } else {
2140 VDataOut = MI.getOperand(0).getReg();
2141 VDataTy = MRI->getType(VDataOut);
2142 NumVDataDwords = DMaskLanes;
2143
2144 if (IsD16 && !STI.hasUnpackedD16VMem())
2145 NumVDataDwords = (DMaskLanes + 1) / 2;
2146 }
2147 }
2148
2149 // Set G16 opcode
2150 if (Subtarget->hasG16() && IsG16) {
2151 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2152        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2153    assert(G16MappingInfo);
2154 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2155 }
2156
2157 // TODO: Check this in verifier.
2158 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2159
2160 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2161 // Keep GLC only when the atomic's result is actually used.
2162 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2163    CPol |= AMDGPU::CPol::GLC;
2164  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2165               AMDGPU::CPol::VOLATILE))
2166    return false;
2167
2168 int NumVAddrRegs = 0;
2169 int NumVAddrDwords = 0;
2170 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2171 // Skip the $noregs and 0s inserted during legalization.
2172 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2173 if (!AddrOp.isReg())
2174 continue; // XXX - Break?
2175
2176 Register Addr = AddrOp.getReg();
2177 if (!Addr)
2178 break;
2179
2180 ++NumVAddrRegs;
2181 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2182 }
2183
2184 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2185 // NSA, these should have been packed into a single value in the first
2186 // address register
2187 const bool UseNSA =
2188 NumVAddrRegs != 1 &&
2189 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2190 : NumVAddrDwords == NumVAddrRegs);
2191 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2192 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2193 return false;
2194 }
2195
2196 if (IsTexFail)
2197 ++NumVDataDwords;
2198
2199 int Opcode = -1;
2200 if (IsGFX12Plus) {
2201 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2202 NumVDataDwords, NumVAddrDwords);
2203 } else if (IsGFX11Plus) {
2204 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2205 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2206 : AMDGPU::MIMGEncGfx11Default,
2207 NumVDataDwords, NumVAddrDwords);
2208 } else if (IsGFX10Plus) {
2209 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2210 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2211 : AMDGPU::MIMGEncGfx10Default,
2212 NumVDataDwords, NumVAddrDwords);
2213 } else {
2214 if (Subtarget->hasGFX90AInsts()) {
2215 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2216 NumVDataDwords, NumVAddrDwords);
2217 if (Opcode == -1) {
2218 LLVM_DEBUG(
2219 dbgs()
2220 << "requested image instruction is not supported on this GPU\n");
2221 return false;
2222 }
2223 }
2224 if (Opcode == -1 &&
2225 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2226 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2227 NumVDataDwords, NumVAddrDwords);
2228 if (Opcode == -1)
2229 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2230 NumVDataDwords, NumVAddrDwords);
2231 }
2232 if (Opcode == -1)
2233 return false;
2234
2235 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2236 .cloneMemRefs(MI);
2237
2238 if (VDataOut) {
2239 if (BaseOpcode->AtomicX2) {
2240 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2241
2242 Register TmpReg = MRI->createVirtualRegister(
2243 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2244 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2245
2246 MIB.addDef(TmpReg);
2247 if (!MRI->use_empty(VDataOut)) {
2248 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2249 .addReg(TmpReg, RegState::Kill, SubReg);
2250 }
2251
2252 } else {
2253 MIB.addDef(VDataOut); // vdata output
2254 }
2255 }
2256
2257 if (VDataIn)
2258 MIB.addReg(VDataIn); // vdata input
2259
2260 for (int I = 0; I != NumVAddrRegs; ++I) {
2261 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2262 if (SrcOp.isReg()) {
2263 assert(SrcOp.getReg() != 0);
2264 MIB.addReg(SrcOp.getReg());
2265 }
2266 }
2267
2268 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2269 if (BaseOpcode->Sampler)
2270 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2271
2272 MIB.addImm(DMask); // dmask
2273
2274 if (IsGFX10Plus)
2275 MIB.addImm(DimInfo->Encoding);
2276 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2277 MIB.addImm(Unorm);
2278
2279 MIB.addImm(CPol);
2280 MIB.addImm(IsA16 && // a16 or r128
2281 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2282 if (IsGFX10Plus)
2283 MIB.addImm(IsA16 ? -1 : 0);
2284
2285 if (!Subtarget->hasGFX90AInsts()) {
2286 MIB.addImm(TFE); // tfe
2287 } else if (TFE) {
2288 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2289 return false;
2290 }
2291
2292 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2293 MIB.addImm(LWE); // lwe
2294 if (!IsGFX10Plus)
2295 MIB.addImm(DimInfo->DA ? -1 : 0);
2296 if (BaseOpcode->HasD16)
2297 MIB.addImm(IsD16 ? -1 : 0);
2298
2299 MI.eraseFromParent();
2300 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2301 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2302 return true;
2303}
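// NSA (non-sequential address) encodings are only chosen above when more than
// one vaddr register is present and the dword count equals the register count
// (or merely covers it, on targets with only partial NSA support); otherwise
// the legalizer has already packed the address into a single register and the
// contiguous encodings are used.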
2304
2305// We need to handle this here because tablegen doesn't support matching
2306// instructions with multiple outputs.
2307bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2308 MachineInstr &MI) const {
2309 Register Dst0 = MI.getOperand(0).getReg();
2310 Register Dst1 = MI.getOperand(1).getReg();
2311
2312 const DebugLoc &DL = MI.getDebugLoc();
2313 MachineBasicBlock *MBB = MI.getParent();
2314
2315 Register Addr = MI.getOperand(3).getReg();
2316 Register Data0 = MI.getOperand(4).getReg();
2317 Register Data1 = MI.getOperand(5).getReg();
2318 unsigned Offset = MI.getOperand(6).getImm();
2319
2320 unsigned Opc;
2321 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2322 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2323 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2324 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2325 break;
2326 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2327 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2328 break;
2329 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2330 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2331 break;
2332 }
2333
2334 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2335 .addDef(Dst1)
2336 .addUse(Addr)
2337 .addUse(Data0)
2338 .addUse(Data1)
2339 .addImm(Offset)
2340 .cloneMemRefs(MI);
2341
2342 MI.eraseFromParent();
2343 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2344}
2345
2346bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2347 MachineInstr &I) const {
2348 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2349 switch (IntrinsicID) {
2350 case Intrinsic::amdgcn_end_cf:
2351 return selectEndCfIntrinsic(I);
2352 case Intrinsic::amdgcn_ds_ordered_add:
2353 case Intrinsic::amdgcn_ds_ordered_swap:
2354 return selectDSOrderedIntrinsic(I, IntrinsicID);
2355 case Intrinsic::amdgcn_ds_gws_init:
2356 case Intrinsic::amdgcn_ds_gws_barrier:
2357 case Intrinsic::amdgcn_ds_gws_sema_v:
2358 case Intrinsic::amdgcn_ds_gws_sema_br:
2359 case Intrinsic::amdgcn_ds_gws_sema_p:
2360 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2361 return selectDSGWSIntrinsic(I, IntrinsicID);
2362 case Intrinsic::amdgcn_ds_append:
2363 return selectDSAppendConsume(I, true);
2364 case Intrinsic::amdgcn_ds_consume:
2365 return selectDSAppendConsume(I, false);
2366 case Intrinsic::amdgcn_init_whole_wave:
2367 return selectInitWholeWave(I);
2368 case Intrinsic::amdgcn_raw_buffer_load_lds:
2369 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2370 case Intrinsic::amdgcn_struct_buffer_load_lds:
2371 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2372 return selectBufferLoadLds(I);
2373  // Until we can store both the address space of the global and the LDS
2374  // arguments by having two MachineMemOperands on an intrinsic, we just trust
2375  // that the argument is a global pointer (buffer pointers have been handled
2376  // by an LLVM IR-level lowering).
2377 case Intrinsic::amdgcn_load_to_lds:
2378 case Intrinsic::amdgcn_global_load_lds:
2379 return selectGlobalLoadLds(I);
2380 case Intrinsic::amdgcn_exp_compr:
2381 if (!STI.hasCompressedExport()) {
2382 Function &F = I.getMF()->getFunction();
2383 F.getContext().diagnose(
2384 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2385 I.getDebugLoc(), DS_Error));
2386 return false;
2387 }
2388 break;
2389 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2390 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2391 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2392 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2393 return selectDSBvhStackIntrinsic(I);
2394 case Intrinsic::amdgcn_s_barrier_init:
2395 case Intrinsic::amdgcn_s_barrier_signal_var:
2396 return selectNamedBarrierInit(I, IntrinsicID);
2397 case Intrinsic::amdgcn_s_wakeup_barrier: {
2398 if (!STI.hasSWakeupBarrier()) {
2399 Function &F = I.getMF()->getFunction();
2400 F.getContext().diagnose(
2401 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2402 I.getDebugLoc(), DS_Error));
2403 return false;
2404 }
2405 return selectNamedBarrierInst(I, IntrinsicID);
2406 }
2407 case Intrinsic::amdgcn_s_barrier_join:
2408 case Intrinsic::amdgcn_s_get_named_barrier_state:
2409 return selectNamedBarrierInst(I, IntrinsicID);
2410 case Intrinsic::amdgcn_s_get_barrier_state:
2411 return selectSGetBarrierState(I, IntrinsicID);
2412 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2413 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2414 }
2415 return selectImpl(I, *CoverageInfo);
2416}
2417
2418bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2419 if (selectImpl(I, *CoverageInfo))
2420 return true;
2421
2422 MachineBasicBlock *BB = I.getParent();
2423 const DebugLoc &DL = I.getDebugLoc();
2424
2425 Register DstReg = I.getOperand(0).getReg();
2426 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2427 assert(Size <= 32 || Size == 64);
2428 const MachineOperand &CCOp = I.getOperand(1);
2429 Register CCReg = CCOp.getReg();
2430 if (!isVCC(CCReg, *MRI)) {
2431 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2432 AMDGPU::S_CSELECT_B32;
2433 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2434 .addReg(CCReg);
2435
2436    // The generic constrainSelectedInstRegOperands doesn't work for the scc
2437    // register bank, because it does not cover the register class that we use
2438    // to represent it. So we need to set the register class manually here.
2439 if (!MRI->getRegClassOrNull(CCReg))
2440 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2441 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2442 .add(I.getOperand(2))
2443 .add(I.getOperand(3));
2444
2445 bool Ret = false;
2446 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2447 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2448 I.eraseFromParent();
2449 return Ret;
2450 }
2451
2452 // Wide VGPR select should have been split in RegBankSelect.
2453 if (Size > 32)
2454 return false;
2455
2456 MachineInstr *Select =
2457 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2458 .addImm(0)
2459 .add(I.getOperand(3))
2460 .addImm(0)
2461 .add(I.getOperand(2))
2462 .add(I.getOperand(1));
2463
2464 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2465 I.eraseFromParent();
2466 return Ret;
2467}
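// Scalar selects (non-VCC condition) are lowered to S_CSELECT_B32/B64 with the
// condition copied into SCC, while VCC-condition selects become
// V_CNDMASK_B32_e64 and are limited to 32 bits here; wider VGPR selects are
// expected to have been split during RegBankSelect.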
2468
2469bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2470 Register DstReg = I.getOperand(0).getReg();
2471 Register SrcReg = I.getOperand(1).getReg();
2472 const LLT DstTy = MRI->getType(DstReg);
2473 const LLT SrcTy = MRI->getType(SrcReg);
2474 const LLT S1 = LLT::scalar(1);
2475
2476 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2477 const RegisterBank *DstRB;
2478 if (DstTy == S1) {
2479 // This is a special case. We don't treat s1 for legalization artifacts as
2480 // vcc booleans.
2481 DstRB = SrcRB;
2482 } else {
2483 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2484 if (SrcRB != DstRB)
2485 return false;
2486 }
2487
2488 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2489
2490 unsigned DstSize = DstTy.getSizeInBits();
2491 unsigned SrcSize = SrcTy.getSizeInBits();
2492
2493 const TargetRegisterClass *SrcRC =
2494 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2495 const TargetRegisterClass *DstRC =
2496 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2497 if (!SrcRC || !DstRC)
2498 return false;
2499
2500 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2501 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2502 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2503 return false;
2504 }
2505
2506 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2507 assert(STI.useRealTrue16Insts());
2508 const DebugLoc &DL = I.getDebugLoc();
2509 MachineBasicBlock *MBB = I.getParent();
2510 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2511 .addReg(SrcReg, 0, AMDGPU::lo16);
2512 I.eraseFromParent();
2513 return true;
2514 }
2515
2516 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2517 MachineBasicBlock *MBB = I.getParent();
2518 const DebugLoc &DL = I.getDebugLoc();
2519
2520 Register LoReg = MRI->createVirtualRegister(DstRC);
2521 Register HiReg = MRI->createVirtualRegister(DstRC);
2522 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2523 .addReg(SrcReg, 0, AMDGPU::sub0);
2524 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2525 .addReg(SrcReg, 0, AMDGPU::sub1);
2526
2527 if (IsVALU && STI.hasSDWA()) {
2528 // Write the low 16-bits of the high element into the high 16-bits of the
2529 // low element.
2530 MachineInstr *MovSDWA =
2531 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2532 .addImm(0) // $src0_modifiers
2533 .addReg(HiReg) // $src0
2534 .addImm(0) // $clamp
2535 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2536 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2537 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2538 .addReg(LoReg, RegState::Implicit);
2539 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2540 } else {
2541 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2542 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2543 Register ImmReg = MRI->createVirtualRegister(DstRC);
2544 if (IsVALU) {
2545 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2546 .addImm(16)
2547 .addReg(HiReg);
2548 } else {
2549 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2550 .addReg(HiReg)
2551 .addImm(16)
2552 .setOperandDead(3); // Dead scc
2553 }
2554
2555 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2556 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2557 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2558
2559 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2560 .addImm(0xffff);
2561 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2562 .addReg(LoReg)
2563 .addReg(ImmReg);
2564 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2565 .addReg(TmpReg0)
2566 .addReg(TmpReg1);
2567
2568 if (!IsVALU) {
2569 And.setOperandDead(3); // Dead scc
2570 Or.setOperandDead(3); // Dead scc
2571 }
2572 }
2573
2574 I.eraseFromParent();
2575 return true;
2576 }
2577
2578 if (!DstTy.isScalar())
2579 return false;
2580
2581 if (SrcSize > 32) {
2582 unsigned SubRegIdx = DstSize < 32
2583 ? static_cast<unsigned>(AMDGPU::sub0)
2584 : TRI.getSubRegFromChannel(0, DstSize / 32);
2585 if (SubRegIdx == AMDGPU::NoSubRegister)
2586 return false;
2587
2588 // Deal with weird cases where the class only partially supports the subreg
2589 // index.
2590 const TargetRegisterClass *SrcWithSubRC
2591 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2592 if (!SrcWithSubRC)
2593 return false;
2594
2595 if (SrcWithSubRC != SrcRC) {
2596 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2597 return false;
2598 }
2599
2600 I.getOperand(1).setSubReg(SubRegIdx);
2601 }
2602
2603 I.setDesc(TII.get(TargetOpcode::COPY));
2604 return true;
2605}
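// The <2 x s32> -> <2 x s16> case above is really a pack: the low 16 bits of
// the high element are written into the high half of the low element, either
// with a single SDWA mov when available on the VALU or with a shift/and/or
// sequence otherwise. Plain scalar truncations reduce to subregister copies.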
2606
2607/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2608static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2609  Mask = maskTrailingOnes<unsigned>(Size);
2610  int SignedMask = static_cast<int>(Mask);
2611 return SignedMask >= -16 && SignedMask <= 64;
2612}
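// For example, a 6-bit mask (0x3f = 63) or a full 32-bit mask (0xffffffff,
// which is -1 when reinterpreted as signed) fits the inline immediate range,
// while a 16-bit mask (0xffff = 65535) does not and would require a literal.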
2613
2614// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2615const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2616    Register Reg, const MachineRegisterInfo &MRI,
2617    const TargetRegisterInfo &TRI) const {
2618 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2619 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2620 return RB;
2621
2622 // Ignore the type, since we don't use vcc in artifacts.
2623 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2624 return &RBI.getRegBankFromRegClass(*RC, LLT());
2625 return nullptr;
2626}
2627
2628bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2629 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2630 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2631 const DebugLoc &DL = I.getDebugLoc();
2632 MachineBasicBlock &MBB = *I.getParent();
2633 const Register DstReg = I.getOperand(0).getReg();
2634 const Register SrcReg = I.getOperand(1).getReg();
2635
2636 const LLT DstTy = MRI->getType(DstReg);
2637 const LLT SrcTy = MRI->getType(SrcReg);
2638 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2639 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2640 const unsigned DstSize = DstTy.getSizeInBits();
2641 if (!DstTy.isScalar())
2642 return false;
2643
2644 // Artifact casts should never use vcc.
2645 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2646
2647 // FIXME: This should probably be illegal and split earlier.
2648 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2649 if (DstSize <= 32)
2650 return selectCOPY(I);
2651
2652 const TargetRegisterClass *SrcRC =
2653 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2654 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2655 const TargetRegisterClass *DstRC =
2656 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2657
2658 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2659 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2660 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2661 .addReg(SrcReg)
2662 .addImm(AMDGPU::sub0)
2663 .addReg(UndefReg)
2664 .addImm(AMDGPU::sub1);
2665 I.eraseFromParent();
2666
2667 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2668 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2669 }
2670
2671 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2672 // 64-bit should have been split up in RegBankSelect
2673
2674 // Try to use an and with a mask if it will save code size.
2675 unsigned Mask;
2676 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2677 MachineInstr *ExtI =
2678 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2679 .addImm(Mask)
2680 .addReg(SrcReg);
2681 I.eraseFromParent();
2682 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2683 }
2684
2685 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2686 MachineInstr *ExtI =
2687 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2688 .addReg(SrcReg)
2689 .addImm(0) // Offset
2690 .addImm(SrcSize); // Width
2691 I.eraseFromParent();
2692 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2693 }
2694
2695 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2696 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2697 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2698 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2699 return false;
2700
2701 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2702 const unsigned SextOpc = SrcSize == 8 ?
2703 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2704 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2705 .addReg(SrcReg);
2706 I.eraseFromParent();
2707 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2708 }
2709
2710 // Using a single 32-bit SALU to calculate the high half is smaller than
2711 // S_BFE with a literal constant operand.
2712 if (DstSize > 32 && SrcSize == 32) {
2713 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2714 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2715 if (Signed) {
2716 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2717 .addReg(SrcReg, 0, SubReg)
2718 .addImm(31)
2719 .setOperandDead(3); // Dead scc
2720 } else {
2721 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2722 .addImm(0);
2723 }
2724 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2725 .addReg(SrcReg, 0, SubReg)
2726 .addImm(AMDGPU::sub0)
2727 .addReg(HiReg)
2728 .addImm(AMDGPU::sub1);
2729 I.eraseFromParent();
2730 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2731 *MRI);
2732 }
2733
2734 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2735 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2736
2737 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2738 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2739 // We need a 64-bit register source, but the high bits don't matter.
2740 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2741 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2742 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2743
2744 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2745 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2746 .addReg(SrcReg, 0, SubReg)
2747 .addImm(AMDGPU::sub0)
2748 .addReg(UndefReg)
2749 .addImm(AMDGPU::sub1);
2750
2751 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2752 .addReg(ExtReg)
2753 .addImm(SrcSize << 16);
2754
2755 I.eraseFromParent();
2756 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2757 }
2758
2759 unsigned Mask;
2760 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2761 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2762 .addReg(SrcReg)
2763 .addImm(Mask)
2764 .setOperandDead(3); // Dead scc
2765 } else {
2766 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2767 .addReg(SrcReg)
2768 .addImm(SrcSize << 16);
2769 }
2770
2771 I.eraseFromParent();
2772 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2773 }
2774
2775 return false;
2776}
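// In the scalar BFE forms above the single source immediate encodes both
// fields, offset in bits [5:0] and width in bits [22:16], so "SrcSize << 16"
// requests SrcSize bits starting at offset 0 (e.g. 0x80000 for an 8-bit
// extension).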
2777
2778static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2779  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2780}
2781
2782static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2783 Register BitcastSrc;
2784 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2785 Reg = BitcastSrc;
2786 return Reg;
2787}
2788
2789static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2790                           Register &Out) {
2791 Register Trunc;
2792 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2793 return false;
2794
2795 Register LShlSrc;
2796 Register Cst;
2797 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2798 Cst = stripCopy(Cst, MRI);
2799 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2800 Out = stripBitCast(LShlSrc, MRI);
2801 return true;
2802 }
2803 }
2804
2805 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2806 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2807 return false;
2808
2809 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2810 LLT::fixed_vector(2, 16));
2811
2812 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2813 assert(Mask.size() == 2);
2814
2815 if (Mask[0] == 1 && Mask[1] <= 1) {
2816 Out = Shuffle->getOperand(0).getReg();
2817 return true;
2818 }
2819
2820 return false;
2821}
2822
2823bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2824 if (!Subtarget->hasSALUFloatInsts())
2825 return false;
2826
2827 Register Dst = I.getOperand(0).getReg();
2828 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2829 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2830 return false;
2831
2832 Register Src = I.getOperand(1).getReg();
2833
2834 if (MRI->getType(Dst) == LLT::scalar(32) &&
2835 MRI->getType(Src) == LLT::scalar(16)) {
2836 if (isExtractHiElt(*MRI, Src, Src)) {
2837 MachineBasicBlock *BB = I.getParent();
2838 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2839 .addUse(Src);
2840 I.eraseFromParent();
2841 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2842 }
2843 }
2844
2845 return false;
2846}
2847
2848bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2849 // Only manually handle the f64 SGPR case.
2850 //
2851 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2852 // the bit ops theoretically have a second result due to the implicit def of
2853 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2854 // that is easy by disabling the check. The result works, but uses a
2855 // nonsensical sreg32orlds_and_sreg_1 regclass.
2856 //
2857  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32
2858  // results to the variadic REG_SEQUENCE operands.
2859
2860 Register Dst = MI.getOperand(0).getReg();
2861 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2862 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2863 MRI->getType(Dst) != LLT::scalar(64))
2864 return false;
2865
2866 Register Src = MI.getOperand(1).getReg();
2867 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2868 if (Fabs)
2869 Src = Fabs->getOperand(1).getReg();
2870
2871 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2872 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2873 return false;
2874
2875 MachineBasicBlock *BB = MI.getParent();
2876 const DebugLoc &DL = MI.getDebugLoc();
2877 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2878 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2879 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2880 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2881
2882 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2883 .addReg(Src, 0, AMDGPU::sub0);
2884 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2885 .addReg(Src, 0, AMDGPU::sub1);
2886 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2887 .addImm(0x80000000);
2888
2889 // Set or toggle sign bit.
2890 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2891 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2892 .addReg(HiReg)
2893 .addReg(ConstReg)
2894 .setOperandDead(3); // Dead scc
2895 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2896 .addReg(LoReg)
2897 .addImm(AMDGPU::sub0)
2898 .addReg(OpReg)
2899 .addImm(AMDGPU::sub1);
2900 MI.eraseFromParent();
2901 return true;
2902}
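// Only the high 32 bits of the 64-bit value are touched above: XOR with
// 0x80000000 flips the sign bit of the f64, and when a G_FABS feeds the fneg
// the combination must set the sign bit unconditionally, hence the OR.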
2903
2904// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2905bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2906 Register Dst = MI.getOperand(0).getReg();
2907 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2908 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2909 MRI->getType(Dst) != LLT::scalar(64))
2910 return false;
2911
2912 Register Src = MI.getOperand(1).getReg();
2913 MachineBasicBlock *BB = MI.getParent();
2914 const DebugLoc &DL = MI.getDebugLoc();
2915 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2916 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2917 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2918 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2919
2920 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2921 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2922 return false;
2923
2924 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2925 .addReg(Src, 0, AMDGPU::sub0);
2926 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2927 .addReg(Src, 0, AMDGPU::sub1);
2928 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2929 .addImm(0x7fffffff);
2930
2931 // Clear sign bit.
2932  // TODO: Should this use S_BITSET0_*?
2933 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2934 .addReg(HiReg)
2935 .addReg(ConstReg)
2936 .setOperandDead(3); // Dead scc
2937 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2938 .addReg(LoReg)
2939 .addImm(AMDGPU::sub0)
2940 .addReg(OpReg)
2941 .addImm(AMDGPU::sub1);
2942
2943 MI.eraseFromParent();
2944 return true;
2945}
2946
2947static bool isConstant(const MachineInstr &MI) {
2948 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2949}
2950
2951void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2952 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2953
2954 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2955 const MachineInstr *PtrMI =
2956 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2957
2958 assert(PtrMI);
2959
2960 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2961 return;
2962
2963 GEPInfo GEPInfo;
2964
2965 for (unsigned i = 1; i != 3; ++i) {
2966 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2967 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2968 assert(OpDef);
2969 if (i == 2 && isConstant(*OpDef)) {
2970 // TODO: Could handle constant base + variable offset, but a combine
2971 // probably should have commuted it.
2972 assert(GEPInfo.Imm == 0);
2973 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2974 continue;
2975 }
2976 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2977 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2978 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2979 else
2980 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2981 }
2982
2983 AddrInfo.push_back(GEPInfo);
2984 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2985}
2986
2987bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2988 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2989}
2990
2991bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2992 if (!MI.hasOneMemOperand())
2993 return false;
2994
2995 const MachineMemOperand *MMO = *MI.memoperands_begin();
2996 const Value *Ptr = MMO->getValue();
2997
2998 // UndefValue means this is a load of a kernel input. These are uniform.
2999 // Sometimes LDS instructions have constant pointers.
3000 // If Ptr is null, then that means this mem operand contains a
3001 // PseudoSourceValue like GOT.
3002  if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
3003    return true;
3004
3005  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3006    return true;
3007
3008 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3009 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3010 AMDGPU::SGPRRegBankID;
3011
3012 const Instruction *I = dyn_cast<Instruction>(Ptr);
3013 return I && I->getMetadata("amdgpu.uniform");
3014}
3015
3016bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3017 for (const GEPInfo &GEPInfo : AddrInfo) {
3018 if (!GEPInfo.VgprParts.empty())
3019 return true;
3020 }
3021 return false;
3022}
3023
3024void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3025 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3026 unsigned AS = PtrTy.getAddressSpace();
3027  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3028      STI.ldsRequiresM0Init()) {
3029 MachineBasicBlock *BB = I.getParent();
3030
3031 // If DS instructions require M0 initialization, insert it before selecting.
3032 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3033 .addImm(-1);
3034 }
3035}
3036
3037bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3038 MachineInstr &I) const {
3039 initM0(I);
3040 return selectImpl(I, *CoverageInfo);
3041}
3042
3043static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3044  if (Reg.isPhysical())
3045 return false;
3046
3047 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3048 const unsigned Opcode = MI.getOpcode();
3049
3050 if (Opcode == AMDGPU::COPY)
3051 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3052
3053 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3054 Opcode == AMDGPU::G_XOR)
3055 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3056 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3057
3058 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3059 return GI->is(Intrinsic::amdgcn_class);
3060
3061 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3062}
3063
3064bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3065 MachineBasicBlock *BB = I.getParent();
3066 MachineOperand &CondOp = I.getOperand(0);
3067 Register CondReg = CondOp.getReg();
3068 const DebugLoc &DL = I.getDebugLoc();
3069
3070 unsigned BrOpcode;
3071 Register CondPhysReg;
3072 const TargetRegisterClass *ConstrainRC;
3073
3074 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3075 // whether the branch is uniform when selecting the instruction. In
3076 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3077 // RegBankSelect knows what it's doing if the branch condition is scc, even
3078 // though it currently does not.
3079 if (!isVCC(CondReg, *MRI)) {
3080 if (MRI->getType(CondReg) != LLT::scalar(32))
3081 return false;
3082
3083 CondPhysReg = AMDGPU::SCC;
3084 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3085 ConstrainRC = &AMDGPU::SReg_32RegClass;
3086 } else {
3087 // FIXME: Should scc->vcc copies and with exec?
3088
3089 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3090 // need to insert an and with exec.
3091 if (!isVCmpResult(CondReg, *MRI)) {
3092 const bool Is64 = STI.isWave64();
3093 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3094 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3095
3096 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3097 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3098 .addReg(CondReg)
3099 .addReg(Exec)
3100 .setOperandDead(3); // Dead scc
3101 CondReg = TmpReg;
3102 }
3103
3104 CondPhysReg = TRI.getVCC();
3105 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3106 ConstrainRC = TRI.getBoolRC();
3107 }
3108
3109 if (!MRI->getRegClassOrNull(CondReg))
3110 MRI->setRegClass(CondReg, ConstrainRC);
3111
3112 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3113 .addReg(CondReg);
3114 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3115 .addMBB(I.getOperand(1).getMBB());
3116
3117 I.eraseFromParent();
3118 return true;
3119}
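// In short: a scalar (non-VCC) condition is copied into SCC and branched on
// with S_CBRANCH_SCC1, while a VCC condition uses S_CBRANCH_VCCNZ and is first
// ANDed with exec unless it is already known to be a V_CMP-style result
// produced under the current exec mask.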
3120
3121bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3122 MachineInstr &I) const {
3123 Register DstReg = I.getOperand(0).getReg();
3124 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3125 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3126 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3127 if (IsVGPR)
3128 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3129
3130 return RBI.constrainGenericRegister(
3131 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3132}
3133
3134bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3135 Register DstReg = I.getOperand(0).getReg();
3136 Register SrcReg = I.getOperand(1).getReg();
3137 Register MaskReg = I.getOperand(2).getReg();
3138 LLT Ty = MRI->getType(DstReg);
3139 LLT MaskTy = MRI->getType(MaskReg);
3140 MachineBasicBlock *BB = I.getParent();
3141 const DebugLoc &DL = I.getDebugLoc();
3142
3143 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3144 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3145 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3146 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3147 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3148 return false;
3149
3150 // Try to avoid emitting a bit operation when we only need to touch half of
3151 // the 64-bit pointer.
3152 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3153 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3154 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3155
3156 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3157 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3158
3159 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3160 !CanCopyLow32 && !CanCopyHi32) {
3161 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3162 .addReg(SrcReg)
3163 .addReg(MaskReg)
3164 .setOperandDead(3); // Dead scc
3165 I.eraseFromParent();
3166 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3167 }
3168
3169 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3170 const TargetRegisterClass &RegRC
3171 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3172
3173 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3174 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3175 const TargetRegisterClass *MaskRC =
3176 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3177
3178 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3179 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3180 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3181 return false;
3182
3183 if (Ty.getSizeInBits() == 32) {
3184 assert(MaskTy.getSizeInBits() == 32 &&
3185 "ptrmask should have been narrowed during legalize");
3186
3187 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3188 .addReg(SrcReg)
3189 .addReg(MaskReg);
3190
3191 if (!IsVGPR)
3192 NewOp.setOperandDead(3); // Dead scc
3193 I.eraseFromParent();
3194 return true;
3195 }
3196
3197 Register HiReg = MRI->createVirtualRegister(&RegRC);
3198 Register LoReg = MRI->createVirtualRegister(&RegRC);
3199
3200 // Extract the subregisters from the source pointer.
3201 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3202 .addReg(SrcReg, 0, AMDGPU::sub0);
3203 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3204 .addReg(SrcReg, 0, AMDGPU::sub1);
3205
3206 Register MaskedLo, MaskedHi;
3207
3208 if (CanCopyLow32) {
3209 // If all the bits in the low half are 1, we only need a copy for it.
3210 MaskedLo = LoReg;
3211 } else {
3212 // Extract the mask subregister and apply the and.
3213 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3214 MaskedLo = MRI->createVirtualRegister(&RegRC);
3215
3216 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3217 .addReg(MaskReg, 0, AMDGPU::sub0);
3218 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3219 .addReg(LoReg)
3220 .addReg(MaskLo);
3221 }
3222
3223 if (CanCopyHi32) {
3224 // If all the bits in the high half are 1, we only need a copy for it.
3225 MaskedHi = HiReg;
3226 } else {
3227 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3228 MaskedHi = MRI->createVirtualRegister(&RegRC);
3229
3230 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3231 .addReg(MaskReg, 0, AMDGPU::sub1);
3232 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3233 .addReg(HiReg)
3234 .addReg(MaskHi);
3235 }
3236
3237 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3238 .addReg(MaskedLo)
3239 .addImm(AMDGPU::sub0)
3240 .addReg(MaskedHi)
3241 .addImm(AMDGPU::sub1);
3242 I.eraseFromParent();
3243 return true;
3244}
3245
3246/// Return the register to use for the index value, and the subregister to use
3247/// for the indirectly accessed register.
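/// For example, indexing 32-bit elements of a 128-bit super-register with an
/// index of (%base + 2) yields {%base, sub2}, as long as the constant offset
/// stays within the number of subregisters.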
3248static std::pair<Register, unsigned>
3250 const TargetRegisterClass *SuperRC, Register IdxReg,
3251 unsigned EltSize, GISelValueTracking &ValueTracking) {
3252 Register IdxBaseReg;
3253 int Offset;
3254
3255 std::tie(IdxBaseReg, Offset) =
3256 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3257 if (IdxBaseReg == AMDGPU::NoRegister) {
3258 // This will happen if the index is a known constant. This should ordinarily
3259 // be legalized out, but handle it as a register just in case.
3260 assert(Offset == 0);
3261 IdxBaseReg = IdxReg;
3262 }
3263
3264 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3265
3266 // Skip out of bounds offsets, or else we would end up using an undefined
3267 // register.
3268 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3269 return std::pair(IdxReg, SubRegs[0]);
3270 return std::pair(IdxBaseReg, SubRegs[Offset]);
3271}
3272
3273bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3274 MachineInstr &MI) const {
3275 Register DstReg = MI.getOperand(0).getReg();
3276 Register SrcReg = MI.getOperand(1).getReg();
3277 Register IdxReg = MI.getOperand(2).getReg();
3278
3279 LLT DstTy = MRI->getType(DstReg);
3280 LLT SrcTy = MRI->getType(SrcReg);
3281
3282 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3283 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3284 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3285
3286 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3287 // this into a waterfall loop.
3288 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3289 return false;
3290
3291 const TargetRegisterClass *SrcRC =
3292 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3293 const TargetRegisterClass *DstRC =
3294 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3295 if (!SrcRC || !DstRC)
3296 return false;
3297 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3298 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3299 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3300 return false;
3301
3302 MachineBasicBlock *BB = MI.getParent();
3303 const DebugLoc &DL = MI.getDebugLoc();
3304 const bool Is64 = DstTy.getSizeInBits() == 64;
3305
3306 unsigned SubReg;
3307 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3308 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3309
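// Three strategies follow: an SGPR source uses S_MOVRELS with the index in M0;
// a VGPR source without VGPR index mode uses V_MOVRELS_B32 with M0; otherwise
// the GPR-index-mode pseudo reads the element directly.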
3310 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3311 if (DstTy.getSizeInBits() != 32 && !Is64)
3312 return false;
3313
3314 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3315 .addReg(IdxReg);
3316
3317 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3318 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3319 .addReg(SrcReg, 0, SubReg)
3320 .addReg(SrcReg, RegState::Implicit);
3321 MI.eraseFromParent();
3322 return true;
3323 }
3324
3325 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3326 return false;
3327
3328 if (!STI.useVGPRIndexMode()) {
3329 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3330 .addReg(IdxReg);
3331 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3332 .addReg(SrcReg, 0, SubReg)
3333 .addReg(SrcReg, RegState::Implicit);
3334 MI.eraseFromParent();
3335 return true;
3336 }
3337
3338 const MCInstrDesc &GPRIDXDesc =
3339 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3340 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3341 .addReg(SrcReg)
3342 .addReg(IdxReg)
3343 .addImm(SubReg);
3344
3345 MI.eraseFromParent();
3346 return true;
3347}
3348
3349// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3350bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3351 MachineInstr &MI) const {
3352 Register DstReg = MI.getOperand(0).getReg();
3353 Register VecReg = MI.getOperand(1).getReg();
3354 Register ValReg = MI.getOperand(2).getReg();
3355 Register IdxReg = MI.getOperand(3).getReg();
3356
3357 LLT VecTy = MRI->getType(DstReg);
3358 LLT ValTy = MRI->getType(ValReg);
3359 unsigned VecSize = VecTy.getSizeInBits();
3360 unsigned ValSize = ValTy.getSizeInBits();
3361
3362 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3363 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3364 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3365
3366 assert(VecTy.getElementType() == ValTy);
3367
3368 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3369 // this into a waterfall loop.
3370 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3371 return false;
3372
3373 const TargetRegisterClass *VecRC =
3374 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3375 const TargetRegisterClass *ValRC =
3376 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3377
3378 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3379 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3380 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3381 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3382 return false;
3383
3384 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3385 return false;
3386
3387 unsigned SubReg;
3388 std::tie(IdxReg, SubReg) =
3389 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3390
3391 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3392 STI.useVGPRIndexMode();
3393
3394 MachineBasicBlock *BB = MI.getParent();
3395 const DebugLoc &DL = MI.getDebugLoc();
3396
3397 if (!IndexMode) {
3398 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3399 .addReg(IdxReg);
3400
3401 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3402 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3403 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3404 .addReg(VecReg)
3405 .addReg(ValReg)
3406 .addImm(SubReg);
3407 MI.eraseFromParent();
3408 return true;
3409 }
3410
3411 const MCInstrDesc &GPRIDXDesc =
3412 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3413 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3414 .addReg(VecReg)
3415 .addReg(ValReg)
3416 .addReg(IdxReg)
3417 .addImm(SubReg);
3418
3419 MI.eraseFromParent();
3420 return true;
3421}
3422
3423bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3424 if (!Subtarget->hasVMemToLDSLoad())
3425 return false;
3426 unsigned Opc;
3427 unsigned Size = MI.getOperand(3).getImm();
3428
3429 // The struct intrinsic variants add one additional operand over raw.
3430 const bool HasVIndex = MI.getNumOperands() == 9;
3431 Register VIndex;
3432 int OpOffset = 0;
3433 if (HasVIndex) {
3434 VIndex = MI.getOperand(4).getReg();
3435 OpOffset = 1;
3436 }
3437
3438 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3439 std::optional<ValueAndVReg> MaybeVOffset =
3441 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3442
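// The opcode suffix encodes the addressing mode: BOTHEN takes both vindex and
// voffset, IDXEN only vindex, OFFEN only voffset, and OFFSET neither.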
3443 switch (Size) {
3444 default:
3445 return false;
3446 case 1:
3447 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3448 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3449 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3450 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3451 break;
3452 case 2:
3453 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3454 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3455 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3456 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3457 break;
3458 case 4:
3459 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3460 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3461 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3462 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3463 break;
3464 case 12:
3465 if (!Subtarget->hasLDSLoadB96_B128())
3466 return false;
3467
3468 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3469 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3470 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3471 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3472 break;
3473 case 16:
3474 if (!Subtarget->hasLDSLoadB96_B128())
3475 return false;
3476
3477 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3478 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3479 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3480 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3481 break;
3482 }
3483
3484 MachineBasicBlock *MBB = MI.getParent();
3485 const DebugLoc &DL = MI.getDebugLoc();
3486 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3487 .add(MI.getOperand(2));
3488
3489 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3490
3491 if (HasVIndex && HasVOffset) {
3492 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3493 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3494 .addReg(VIndex)
3495 .addImm(AMDGPU::sub0)
3496 .addReg(VOffset)
3497 .addImm(AMDGPU::sub1);
3498
3499 MIB.addReg(IdxReg);
3500 } else if (HasVIndex) {
3501 MIB.addReg(VIndex);
3502 } else if (HasVOffset) {
3503 MIB.addReg(VOffset);
3504 }
3505
3506 MIB.add(MI.getOperand(1)); // rsrc
3507 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3508 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3509 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3510 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3511 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3512 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3513 MIB.addImm(
3514 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3515 ? 1
3516 : 0); // swz
3517
3518 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3519 // Don't set the offset value here because the pointer points to the base of
3520 // the buffer.
3521 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3522
3523 MachinePointerInfo StorePtrI = LoadPtrI;
3524 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3528
3529 auto F = LoadMMO->getFlags() &
3531 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3532 Size, LoadMMO->getBaseAlign());
3533
3534 MachineMemOperand *StoreMMO =
3535 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3536 sizeof(int32_t), LoadMMO->getBaseAlign());
3537
3538 MIB.setMemRefs({LoadMMO, StoreMMO});
3539
3540 MI.eraseFromParent();
3541 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3542}
3543
3544/// Match a zero extend from a 32-bit value to 64-bits.
3545Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3546 Register ZExtSrc;
3547 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3548 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3549
3550 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3551 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3552 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3553 return Register();
3554
3555 assert(Def->getNumOperands() == 3 &&
3556 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3557 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3558 return Def->getOperand(1).getReg();
3559 }
3560
3561 return Register();
3562}
3563
3564/// Match a sign extend from a 32-bit value to 64-bits.
3565Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3566 Register SExtSrc;
3567 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3568 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3569
3570 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 (G_ASHR %x, 31))
3571 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3572 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3573 return Register();
3574
3575 assert(Def->getNumOperands() == 3 &&
3576 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3577 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3578 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3579 m_SpecificICst(31))))
3580 return Def->getOperand(1).getReg();
3581
3582 if (VT->signBitIsZero(Reg))
3583 return matchZeroExtendFromS32(Reg);
3584
3585 return Register();
3586}
3587
3588/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3589/// is 32-bit.
3591AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3592 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3593 : matchZeroExtendFromS32(Reg);
3594}
3595
3596/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3597/// is 32-bit.
3599AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3600 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3601 : matchSignExtendFromS32(Reg);
3602}
3603
3605AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3606 bool IsSigned) const {
3607 if (IsSigned)
3608 return matchSignExtendFromS32OrS32(Reg);
3609
3610 return matchZeroExtendFromS32OrS32(Reg);
3611}
3612
3613Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3614 Register AnyExtSrc;
3615 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3616 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3617
3618 // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3619 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3620 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3621 return Register();
3622
3623 assert(Def->getNumOperands() == 3 &&
3624 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3625
3626 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3627 return Def->getOperand(1).getReg();
3628
3629 return Register();
3630}
3631
3632bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3633 if (!Subtarget->hasVMemToLDSLoad())
3634 return false;
3635
3636 unsigned Opc;
3637 unsigned Size = MI.getOperand(3).getImm();
3638
3639 switch (Size) {
3640 default:
3641 return false;
3642 case 1:
3643 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3644 break;
3645 case 2:
3646 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3647 break;
3648 case 4:
3649 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3650 break;
3651 case 12:
3652 if (!Subtarget->hasLDSLoadB96_B128())
3653 return false;
3654 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3655 break;
3656 case 16:
3657 if (!Subtarget->hasLDSLoadB96_B128())
3658 return false;
3659 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3660 break;
3661 }
3662
3663 MachineBasicBlock *MBB = MI.getParent();
3664 const DebugLoc &DL = MI.getDebugLoc();
3665 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3666 .add(MI.getOperand(2));
3667
3668 Register Addr = MI.getOperand(1).getReg();
3669 Register VOffset;
3670 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3671 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
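// For example (illustrative vreg names), %addr = G_PTR_ADD %sgpr_base,
// (zext %vgpr_off) selects the SADDR form with %sgpr_base as the scalar base
// and %vgpr_off as VOffset.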
3672 if (!isSGPR(Addr)) {
3673 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3674 if (isSGPR(AddrDef->Reg)) {
3675 Addr = AddrDef->Reg;
3676 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3677 Register SAddr =
3678 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3679 if (isSGPR(SAddr)) {
3680 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3681 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3682 Addr = SAddr;
3683 VOffset = Off;
3684 }
3685 }
3686 }
3687 }
3688
3689 if (isSGPR(Addr)) {
3691 if (!VOffset) {
3692 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3693 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3694 .addImm(0);
3695 }
3696 }
3697
3698 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3699 .addReg(Addr);
3700
3701 if (isSGPR(Addr))
3702 MIB.addReg(VOffset);
3703
3704 MIB.add(MI.getOperand(4)); // offset
3705
3706 unsigned Aux = MI.getOperand(5).getImm();
3707 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3708
3709 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3710 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3711 LoadPtrI.Offset = MI.getOperand(4).getImm();
3712 MachinePointerInfo StorePtrI = LoadPtrI;
3713 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3717 auto F = LoadMMO->getFlags() &
3719 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3720 Size, LoadMMO->getBaseAlign());
3721 MachineMemOperand *StoreMMO =
3722 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3723 sizeof(int32_t), Align(4));
3724
3725 MIB.setMemRefs({LoadMMO, StoreMMO});
3726
3727 MI.eraseFromParent();
3728 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3729}
3730
3731bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3732 MachineInstr &MI) const {
3733 unsigned OpcodeOpIdx =
3734 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3735 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3736 MI.removeOperand(OpcodeOpIdx);
3737 MI.addImplicitDefUseOperands(*MI.getMF());
3738 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3739}
3740
3741 // FIXME: This should be removed in favor of letting the patterns select. We
3742 // just need the AGPR/VGPR combination versions.
3743bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3744 unsigned Opc;
3745 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3746 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3747 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3748 break;
3749 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3750 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3751 break;
3752 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3753 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3754 break;
3755 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3756 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3757 break;
3758 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3759 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3760 break;
3761 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3762 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3763 break;
3764 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3765 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3766 break;
3767 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3768 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3769 break;
3770 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3771 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3772 break;
3773 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3774 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3775 break;
3776 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3777 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3778 break;
3779 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3780 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3781 break;
3782 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3783 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3784 break;
3785 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3786 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3787 break;
3788 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3789 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3790 break;
3791 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3792 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3793 break;
3794 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3795 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3796 break;
3797 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3798 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3799 break;
3800 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3801 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3802 break;
3803 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3804 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3805 break;
3806 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3807 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3808 break;
3809 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3810 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3811 break;
3812 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3813 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3814 break;
3815 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3816 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3817 break;
3818 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3819 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3820 break;
3821 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3822 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3823 break;
3824 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3825 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3826 break;
3827 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3828 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3829 break;
3830 default:
3831 llvm_unreachable("unhandled smfmac intrinsic");
3832 }
3833
3834 auto VDst_In = MI.getOperand(4);
3835
3836 MI.setDesc(TII.get(Opc));
3837 MI.removeOperand(4); // VDst_In
3838 MI.removeOperand(1); // Intrinsic ID
3839 MI.addOperand(VDst_In); // Readd VDst_In to the end
3840 MI.addImplicitDefUseOperands(*MI.getMF());
3841 const MCInstrDesc &MCID = MI.getDesc();
3842 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3843 MI.getOperand(0).setIsEarlyClobber(true);
3844 }
3845 return true;
3846}
3847
3848bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3849 MachineInstr &MI, Intrinsic::ID IntrID) const {
3850 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3851 !Subtarget->hasPermlane16Swap())
3852 return false;
3853 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3854 !Subtarget->hasPermlane32Swap())
3855 return false;
3856
3857 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3858 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3859 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3860
3861 MI.removeOperand(2);
3862 MI.setDesc(TII.get(Opcode));
3863 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3864
3865 MachineOperand &FI = MI.getOperand(4);
3867
3868 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3869}
3870
3871bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3872 Register DstReg = MI.getOperand(0).getReg();
3873 Register SrcReg = MI.getOperand(1).getReg();
3874 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3875 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3876 MachineBasicBlock *MBB = MI.getParent();
3877 const DebugLoc &DL = MI.getDebugLoc();
3878
3879 if (IsVALU) {
3880 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3881 .addImm(Subtarget->getWavefrontSizeLog2())
3882 .addReg(SrcReg);
3883 } else {
3884 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3885 .addReg(SrcReg)
3886 .addImm(Subtarget->getWavefrontSizeLog2())
3887 .setOperandDead(3); // Dead scc
3888 }
3889
3890 const TargetRegisterClass &RC =
3891 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3892 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3893 return false;
3894
3895 MI.eraseFromParent();
3896 return true;
3897}
3898
3899bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
3900 MachineInstr &MI) const {
3901 assert(MI.getNumOperands() == 4);
3902 MachineBasicBlock *MBB = MI.getParent();
3903 const DebugLoc &DL = MI.getDebugLoc();
3904
3905 Register DstReg = MI.getOperand(0).getReg();
3906 Register ValReg = MI.getOperand(2).getReg();
3907 Register IdxReg = MI.getOperand(3).getReg();
3908
3909 const LLT DstTy = MRI->getType(DstReg);
3910 unsigned DstSize = DstTy.getSizeInBits();
3911 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3912 const TargetRegisterClass *DstRC =
3913 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
3914
3915 if (DstTy != LLT::scalar(32))
3916 return false;
3917
3918 if (!Subtarget->supportsBPermute())
3919 return false;
3920
3921 // If we can bpermute across the whole wave, then just do that
3922 if (Subtarget->supportsWaveWideBPermute()) {
3923 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
3924 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
3925 .addImm(2)
3926 .addReg(IdxReg);
3927
3928 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
3929 .addReg(ShiftIdxReg)
3930 .addReg(ValReg)
3931 .addImm(0);
3932 } else {
3933 // Otherwise, we need to make use of whole wave mode
3934 assert(Subtarget->isWave64());
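    // Strategy: compute the bpermute within each 32-lane half, compute it
    // again against the swapped halves (V_PERMLANE64_B32) under WWM, then use
    // V_CNDMASK_B32 to pick, per lane, whichever of the two results
    // corresponds to the requested source lane.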
3935
3936 // Set inactive lanes to poison
3937 Register UndefValReg =
3938 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
3939 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
3940
3941 Register UndefExecReg = MRI->createVirtualRegister(
3942 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
3943 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
3944
3945 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
3946 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
3947 .addImm(0)
3948 .addReg(ValReg)
3949 .addImm(0)
3950 .addReg(UndefValReg)
3951 .addReg(UndefExecReg);
3952
3953 // ds_bpermute requires index to be multiplied by 4
3954 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
3955 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
3956 .addImm(2)
3957 .addReg(IdxReg);
3958
3959 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
3960 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
3961 .addImm(0)
3962 .addReg(ShiftIdxReg)
3963 .addImm(0)
3964 .addReg(UndefValReg)
3965 .addReg(UndefExecReg);
3966
3967 // Get permutation of each half, then we'll select which one to use
3968 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
3969 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
3970 .addReg(PoisonIdxReg)
3971 .addReg(PoisonValReg)
3972 .addImm(0);
3973
3974 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
3975 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
3976 .addReg(PoisonValReg);
3977
3978 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
3979 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
3980 .addReg(PoisonIdxReg)
3981 .addReg(SwappedValReg)
3982 .addImm(0);
3983
3984 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
3985 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
3986 .addReg(OppSidePermReg);
3987
3988 // Select which side to take the permute from
3989 // We can get away with only using mbcnt_lo here since we're only
3990 // trying to detect which side of 32 each lane is on, and mbcnt_lo
3991 // returns 32 for lanes 32-63.
3992 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
3993 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
3994 .addImm(-1)
3995 .addImm(0);
3996
3997 Register XORReg = MRI->createVirtualRegister(DstRC);
3998 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
3999 .addReg(ThreadIDReg)
4000 .addReg(PoisonIdxReg);
4001
4002 Register ANDReg = MRI->createVirtualRegister(DstRC);
4003 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4004 .addReg(XORReg)
4005 .addImm(32);
4006
4007 Register CompareReg = MRI->createVirtualRegister(
4008 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4009 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4010 .addReg(ANDReg)
4011 .addImm(0);
4012
4013 // Finally do the selection
4014 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4015 .addImm(0)
4016 .addReg(WWMSwapPermReg)
4017 .addImm(0)
4018 .addReg(SameSidePermReg)
4019 .addReg(CompareReg);
4020 }
4021
4022 MI.eraseFromParent();
4023 return true;
4024}
4025
4026 // Match a BITOP3 operation and return the number of matched instructions plus
4027 // the truth table.
4028static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4030 const MachineRegisterInfo &MRI) {
4031 unsigned NumOpcodes = 0;
4032 uint8_t LHSBits, RHSBits;
4033
4034 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4035 // Define truth table given Src0, Src1, Src2 bits permutations:
4036 // 0 0 0
4037 // 0 0 1
4038 // 0 1 0
4039 // 0 1 1
4040 // 1 0 0
4041 // 1 0 1
4042 // 1 1 0
4043 // 1 1 1
4044 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
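    // For example, if the selected operands end up as Src = {a, b, c}, the
    // expression (a & b) | c is encoded as (0xf0 & 0xcc) | 0xaa = 0xea.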
4045
4046 if (mi_match(Op, MRI, m_AllOnesInt())) {
4047 Bits = 0xff;
4048 return true;
4049 }
4050 if (mi_match(Op, MRI, m_ZeroInt())) {
4051 Bits = 0;
4052 return true;
4053 }
4054
4055 for (unsigned I = 0; I < Src.size(); ++I) {
4056 // Try to find existing reused operand
4057 if (Src[I] == Op) {
4058 Bits = SrcBits[I];
4059 return true;
4060 }
4061 // Try to replace parent operator
4062 if (Src[I] == R) {
4063 Bits = SrcBits[I];
4064 Src[I] = Op;
4065 return true;
4066 }
4067 }
4068
4069 if (Src.size() == 3) {
4070 // No room left for operands. Try one last time; there can be a 'not' of
4071 // one of our source operands. In this case we can compute the bits
4072 // without growing the Src vector.
4073 Register LHS;
4074 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4076 for (unsigned I = 0; I < Src.size(); ++I) {
4077 if (Src[I] == LHS) {
4078 Bits = ~SrcBits[I];
4079 return true;
4080 }
4081 }
4082 }
4083
4084 return false;
4085 }
4086
4087 Bits = SrcBits[Src.size()];
4088 Src.push_back(Op);
4089 return true;
4090 };
4091
4092 MachineInstr *MI = MRI.getVRegDef(R);
4093 switch (MI->getOpcode()) {
4094 case TargetOpcode::G_AND:
4095 case TargetOpcode::G_OR:
4096 case TargetOpcode::G_XOR: {
4097 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4098 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4099
4100 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4101 if (!getOperandBits(LHS, LHSBits) ||
4102 !getOperandBits(RHS, RHSBits)) {
4103 Src = Backup;
4104 return std::make_pair(0, 0);
4105 }
4106
4107 // Recursion is naturally limited by the size of the operand vector.
4108 auto Op = BitOp3_Op(LHS, Src, MRI);
4109 if (Op.first) {
4110 NumOpcodes += Op.first;
4111 LHSBits = Op.second;
4112 }
4113
4114 Op = BitOp3_Op(RHS, Src, MRI);
4115 if (Op.first) {
4116 NumOpcodes += Op.first;
4117 RHSBits = Op.second;
4118 }
4119 break;
4120 }
4121 default:
4122 return std::make_pair(0, 0);
4123 }
4124
4125 uint8_t TTbl;
4126 switch (MI->getOpcode()) {
4127 case TargetOpcode::G_AND:
4128 TTbl = LHSBits & RHSBits;
4129 break;
4130 case TargetOpcode::G_OR:
4131 TTbl = LHSBits | RHSBits;
4132 break;
4133 case TargetOpcode::G_XOR:
4134 TTbl = LHSBits ^ RHSBits;
4135 break;
4136 default:
4137 break;
4138 }
4139
4140 return std::make_pair(NumOpcodes + 1, TTbl);
4141}
4142
4143bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4144 if (!Subtarget->hasBitOp3Insts())
4145 return false;
4146
4147 Register DstReg = MI.getOperand(0).getReg();
4148 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4149 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4150 if (!IsVALU)
4151 return false;
4152
4154 uint8_t TTbl;
4155 unsigned NumOpcodes;
4156
4157 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4158
4159 // The Src.empty() case can happen if the operands are all zeros or all ones.
4160 // Normally this should have been optimized out before reaching this point.
4161 if (NumOpcodes < 2 || Src.empty())
4162 return false;
4163
4164 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4165 if (NumOpcodes == 2 && IsB32) {
4166 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes the
4167 // asm more readable. This cannot be modeled with AddedComplexity because the
4168 // selector does not know how many operations we matched.
4169 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4170 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4171 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4172 return false;
4173 } else if (NumOpcodes < 4) {
4174 // For the uniform case the threshold should be higher to account for moves
4175 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
4176 // be in SGPRs, with a readfirstlane afterwards.
4177 return false;
4178 }
4179
4180 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4181 if (!IsB32 && STI.hasTrue16BitInsts())
4182 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4183 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4184 unsigned CBL = STI.getConstantBusLimit(Opc);
4185 MachineBasicBlock *MBB = MI.getParent();
4186 const DebugLoc &DL = MI.getDebugLoc();
4187
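// Copy SGPR sources beyond the constant-bus limit into VGPRs so the resulting
// V_BITOP3 does not violate the constant-bus restriction.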
4188 for (unsigned I = 0; I < Src.size(); ++I) {
4189 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4190 if (RB->getID() != AMDGPU::SGPRRegBankID)
4191 continue;
4192 if (CBL > 0) {
4193 --CBL;
4194 continue;
4195 }
4196 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4197 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4198 .addReg(Src[I]);
4199 Src[I] = NewReg;
4200 }
4201
4202 // The last operand can be ignored, turning a ternary operation into a binary
4203 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4204 // 'c' with 'a' here without changing the answer. In some pathological cases
4205 // it should even be possible to end up with an operation with a single
4206 // operand if the optimizer does not catch it.
4207 while (Src.size() < 3)
4208 Src.push_back(Src[0]);
4209
4210 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4211 if (!IsB32)
4212 MIB.addImm(0); // src_mod0
4213 MIB.addReg(Src[0]);
4214 if (!IsB32)
4215 MIB.addImm(0); // src_mod1
4216 MIB.addReg(Src[1]);
4217 if (!IsB32)
4218 MIB.addImm(0); // src_mod2
4219 MIB.addReg(Src[2])
4220 .addImm(TTbl);
4221 if (!IsB32)
4222 MIB.addImm(0); // op_sel
4223
4224 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4225 MI.eraseFromParent();
4226
4227 return true;
4228}
4229
4230bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4231 Register SrcReg = MI.getOperand(0).getReg();
4232 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4233 return false;
4234
4235 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4236 Register SP =
4237 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4238 Register WaveAddr = getWaveAddress(DefMI);
4239 MachineBasicBlock *MBB = MI.getParent();
4240 const DebugLoc &DL = MI.getDebugLoc();
4241
4242 if (!WaveAddr) {
4243 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4244 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4245 .addReg(SrcReg)
4246 .addImm(Subtarget->getWavefrontSizeLog2())
4247 .setOperandDead(3); // Dead scc
4248 }
4249
4250 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4251 .addReg(WaveAddr);
4252
4253 MI.eraseFromParent();
4254 return true;
4255}
4256
4258
4259 if (!I.isPreISelOpcode()) {
4260 if (I.isCopy())
4261 return selectCOPY(I);
4262 return true;
4263 }
4264
4265 switch (I.getOpcode()) {
4266 case TargetOpcode::G_AND:
4267 case TargetOpcode::G_OR:
4268 case TargetOpcode::G_XOR:
4269 if (selectBITOP3(I))
4270 return true;
4271 if (selectImpl(I, *CoverageInfo))
4272 return true;
4273 return selectG_AND_OR_XOR(I);
4274 case TargetOpcode::G_ADD:
4275 case TargetOpcode::G_SUB:
4276 case TargetOpcode::G_PTR_ADD:
4277 if (selectImpl(I, *CoverageInfo))
4278 return true;
4279 return selectG_ADD_SUB(I);
4280 case TargetOpcode::G_UADDO:
4281 case TargetOpcode::G_USUBO:
4282 case TargetOpcode::G_UADDE:
4283 case TargetOpcode::G_USUBE:
4284 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4285 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4286 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4287 return selectG_AMDGPU_MAD_64_32(I);
4288 case TargetOpcode::G_INTTOPTR:
4289 case TargetOpcode::G_BITCAST:
4290 case TargetOpcode::G_PTRTOINT:
4291 case TargetOpcode::G_FREEZE:
4292 return selectCOPY(I);
4293 case TargetOpcode::G_FNEG:
4294 if (selectImpl(I, *CoverageInfo))
4295 return true;
4296 return selectG_FNEG(I);
4297 case TargetOpcode::G_FABS:
4298 if (selectImpl(I, *CoverageInfo))
4299 return true;
4300 return selectG_FABS(I);
4301 case TargetOpcode::G_EXTRACT:
4302 return selectG_EXTRACT(I);
4303 case TargetOpcode::G_MERGE_VALUES:
4304 case TargetOpcode::G_CONCAT_VECTORS:
4305 return selectG_MERGE_VALUES(I);
4306 case TargetOpcode::G_UNMERGE_VALUES:
4307 return selectG_UNMERGE_VALUES(I);
4308 case TargetOpcode::G_BUILD_VECTOR:
4309 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4310 return selectG_BUILD_VECTOR(I);
4311 case TargetOpcode::G_IMPLICIT_DEF:
4312 return selectG_IMPLICIT_DEF(I);
4313 case TargetOpcode::G_INSERT:
4314 return selectG_INSERT(I);
4315 case TargetOpcode::G_INTRINSIC:
4316 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4317 return selectG_INTRINSIC(I);
4318 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4319 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4320 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4321 case TargetOpcode::G_ICMP:
4322 case TargetOpcode::G_FCMP:
4323 if (selectG_ICMP_or_FCMP(I))
4324 return true;
4325 return selectImpl(I, *CoverageInfo);
4326 case TargetOpcode::G_LOAD:
4327 case TargetOpcode::G_ZEXTLOAD:
4328 case TargetOpcode::G_SEXTLOAD:
4329 case TargetOpcode::G_STORE:
4330 case TargetOpcode::G_ATOMIC_CMPXCHG:
4331 case TargetOpcode::G_ATOMICRMW_XCHG:
4332 case TargetOpcode::G_ATOMICRMW_ADD:
4333 case TargetOpcode::G_ATOMICRMW_SUB:
4334 case TargetOpcode::G_ATOMICRMW_AND:
4335 case TargetOpcode::G_ATOMICRMW_OR:
4336 case TargetOpcode::G_ATOMICRMW_XOR:
4337 case TargetOpcode::G_ATOMICRMW_MIN:
4338 case TargetOpcode::G_ATOMICRMW_MAX:
4339 case TargetOpcode::G_ATOMICRMW_UMIN:
4340 case TargetOpcode::G_ATOMICRMW_UMAX:
4341 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4342 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4343 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4344 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4345 case TargetOpcode::G_ATOMICRMW_FADD:
4346 case TargetOpcode::G_ATOMICRMW_FMIN:
4347 case TargetOpcode::G_ATOMICRMW_FMAX:
4348 return selectG_LOAD_STORE_ATOMICRMW(I);
4349 case TargetOpcode::G_SELECT:
4350 return selectG_SELECT(I);
4351 case TargetOpcode::G_TRUNC:
4352 return selectG_TRUNC(I);
4353 case TargetOpcode::G_SEXT:
4354 case TargetOpcode::G_ZEXT:
4355 case TargetOpcode::G_ANYEXT:
4356 case TargetOpcode::G_SEXT_INREG:
4357 // This is a workaround. For extension from type i1, `selectImpl()` uses
4358 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4359 // type i1 can only be held in an SGPR class.
4360 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4361 selectImpl(I, *CoverageInfo))
4362 return true;
4363 return selectG_SZA_EXT(I);
4364 case TargetOpcode::G_FPEXT:
4365 if (selectG_FPEXT(I))
4366 return true;
4367 return selectImpl(I, *CoverageInfo);
4368 case TargetOpcode::G_BRCOND:
4369 return selectG_BRCOND(I);
4370 case TargetOpcode::G_GLOBAL_VALUE:
4371 return selectG_GLOBAL_VALUE(I);
4372 case TargetOpcode::G_PTRMASK:
4373 return selectG_PTRMASK(I);
4374 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4375 return selectG_EXTRACT_VECTOR_ELT(I);
4376 case TargetOpcode::G_INSERT_VECTOR_ELT:
4377 return selectG_INSERT_VECTOR_ELT(I);
4378 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4379 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4380 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4381 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4382 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4383 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4385 assert(Intr && "not an image intrinsic with image pseudo");
4386 return selectImageIntrinsic(I, Intr);
4387 }
4388 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4389 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4390 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4391 return selectBVHIntersectRayIntrinsic(I);
4392 case AMDGPU::G_SBFX:
4393 case AMDGPU::G_UBFX:
4394 return selectG_SBFX_UBFX(I);
4395 case AMDGPU::G_SI_CALL:
4396 I.setDesc(TII.get(AMDGPU::SI_CALL));
4397 return true;
4398 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4399 return selectWaveAddress(I);
4400 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4401 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4402 return true;
4403 }
4404 case AMDGPU::G_STACKRESTORE:
4405 return selectStackRestore(I);
4406 case AMDGPU::G_PHI:
4407 return selectPHI(I);
4408 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4409 return selectCOPY_SCC_VCC(I);
4410 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4411 return selectCOPY_VCC_SCC(I);
4412 case AMDGPU::G_AMDGPU_READANYLANE:
4413 return selectReadAnyLane(I);
4414 case TargetOpcode::G_CONSTANT:
4415 case TargetOpcode::G_FCONSTANT:
4416 default:
4417 return selectImpl(I, *CoverageInfo);
4418 }
4419 return false;
4420}
4421
4423AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4424 return {{
4425 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4426 }};
4427
4428}
4429
4430std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4431 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4432 unsigned Mods = 0;
4433 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4434
4435 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4436 Src = MI->getOperand(1).getReg();
4437 Mods |= SISrcMods::NEG;
4438 MI = getDefIgnoringCopies(Src, *MRI);
4439 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4440 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4441 // denormal mode, but we're implicitly canonicalizing in a source operand.
4442 const ConstantFP *LHS =
4443 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4444 if (LHS && LHS->isZero()) {
4445 Mods |= SISrcMods::NEG;
4446 Src = MI->getOperand(2).getReg();
4447 }
4448 }
4449
4450 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4451 Src = MI->getOperand(1).getReg();
4452 Mods |= SISrcMods::ABS;
4453 }
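  // E.g. with AllowAbs, a source of the form (fneg (fabs x)) has been folded
  // to x with both SISrcMods::NEG and SISrcMods::ABS set by this point.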
4454
4455 if (OpSel)
4456 Mods |= SISrcMods::OP_SEL_0;
4457
4458 return std::pair(Src, Mods);
4459}
4460
4461Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4462 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4463 bool ForceVGPR) const {
4464 if ((Mods != 0 || ForceVGPR) &&
4465 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4466
4467 // If we looked through copies to find source modifiers on an SGPR operand,
4468 // we now have an SGPR register source. To avoid potentially violating the
4469 // constant bus restriction, we need to insert a copy to a VGPR.
4470 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4471 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4472 TII.get(AMDGPU::COPY), VGPRSrc)
4473 .addReg(Src);
4474 Src = VGPRSrc;
4475 }
4476
4477 return Src;
4478}
4479
4480///
4481/// This will select either an SGPR or VGPR operand and will save us from
4482/// having to write an extra tablegen pattern.
4484AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4485 return {{
4486 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4487 }};
4488}
4489
4491AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4492 Register Src;
4493 unsigned Mods;
4494 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4495
4496 return {{
4497 [=](MachineInstrBuilder &MIB) {
4498 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4499 },
4500 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4501 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4502 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4503 }};
4504}
4505
4507AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4508 Register Src;
4509 unsigned Mods;
4510 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4511 /*IsCanonicalizing=*/true,
4512 /*AllowAbs=*/false);
4513
4514 return {{
4515 [=](MachineInstrBuilder &MIB) {
4516 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4517 },
4518 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4519 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4520 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4521 }};
4522}
4523
4525AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4526 return {{
4527 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4528 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4529 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4530 }};
4531}
4532
4534AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4535 Register Src;
4536 unsigned Mods;
4537 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4538
4539 return {{
4540 [=](MachineInstrBuilder &MIB) {
4541 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4542 },
4543 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4544 }};
4545}
4546
4548AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4549 MachineOperand &Root) const {
4550 Register Src;
4551 unsigned Mods;
4552 std::tie(Src, Mods) =
4553 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4554
4555 return {{
4556 [=](MachineInstrBuilder &MIB) {
4557 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4558 },
4559 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4560 }};
4561}
4562
4564AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4565 Register Src;
4566 unsigned Mods;
4567 std::tie(Src, Mods) =
4568 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4569 /*AllowAbs=*/false);
4570
4571 return {{
4572 [=](MachineInstrBuilder &MIB) {
4573 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4574 },
4575 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4576 }};
4577}
4578
4580AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4581 Register Reg = Root.getReg();
4582 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4583 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4584 return {};
4585 return {{
4586 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4587 }};
4588}
4589
4590enum class SrcStatus {
4595 // This means current op = [op_upper, op_lower] and src = -op_lower.
4598 // This means current op = [op_upper, op_lower] and src = [op_upper,
4599 // -op_lower].
4607};
4608/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4609static bool isTruncHalf(const MachineInstr *MI,
4610 const MachineRegisterInfo &MRI) {
4611 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4612 return false;
4613
4614 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4615 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4616 return DstSize * 2 == SrcSize;
4617}
4618
4619 /// Test if the MI is a logical shift right by half the bit width,
4620 /// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`.
4621static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4622 if (MI->getOpcode() != AMDGPU::G_LSHR)
4623 return false;
4624
4625 Register ShiftSrc;
4626 std::optional<ValueAndVReg> ShiftAmt;
4627 if (mi_match(MI->getOperand(0).getReg(), MRI,
4628 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4629 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4630 unsigned Shift = ShiftAmt->Value.getZExtValue();
4631 return Shift * 2 == SrcSize;
4632 }
4633 return false;
4634}
4635
4636 /// Test if the MI is a shift left by half the bit width,
4637 /// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`.
4638static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4639 if (MI->getOpcode() != AMDGPU::G_SHL)
4640 return false;
4641
4642 Register ShiftSrc;
4643 std::optional<ValueAndVReg> ShiftAmt;
4644 if (mi_match(MI->getOperand(0).getReg(), MRI,
4645 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4646 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4647 unsigned Shift = ShiftAmt->Value.getZExtValue();
4648 return Shift * 2 == SrcSize;
4649 }
4650 return false;
4651}
4652
4653 /// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`.
4654static bool isUnmergeHalf(const MachineInstr *MI,
4655 const MachineRegisterInfo &MRI) {
4656 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4657 return false;
4658 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4659 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4660}
4661
4663
4665 const MachineRegisterInfo &MRI) {
4666 LLT OpTy = MRI.getType(Reg);
4667 if (OpTy.isScalar())
4668 return TypeClass::SCALAR;
4669 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4672}
4673
4675 const MachineRegisterInfo &MRI) {
4677 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4678 return SrcStatus::INVALID;
4679
4680 switch (S) {
4681 case SrcStatus::IS_SAME:
4682 if (NegType == TypeClass::VECTOR_OF_TWO) {
4683 // Vector of 2:
4684 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4685 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4686 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4687 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4689 }
4690 if (NegType == TypeClass::SCALAR) {
4691 // Scalar:
4692 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4693 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4694 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4695 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4696 return SrcStatus::IS_HI_NEG;
4697 }
4698 break;
4700 if (NegType == TypeClass::VECTOR_OF_TWO) {
4701 // Vector of 2:
4702 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4703 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4704 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4705 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4706 return SrcStatus::IS_LO_NEG;
4707 }
4708 if (NegType == TypeClass::SCALAR) {
4709 // Scalar:
4710 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4711 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4712 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4713 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4714 return SrcStatus::IS_SAME;
4715 }
4716 break;
4718 if (NegType == TypeClass::VECTOR_OF_TWO) {
4719 // Vector of 2:
4720 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4721 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4722 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4723 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4724 return SrcStatus::IS_HI_NEG;
4725 }
4726 if (NegType == TypeClass::SCALAR) {
4727 // Scalar:
4728 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4729 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4730 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4731 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4733 }
4734 break;
4736 if (NegType == TypeClass::VECTOR_OF_TWO) {
4737 // Vector of 2:
4738 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4739 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4740 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4741 // [SrcHi, SrcLo] = [OpHi, OpLo]
4742 return SrcStatus::IS_SAME;
4743 }
4744 if (NegType == TypeClass::SCALAR) {
4745 // Scalar:
4746 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4747 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4748 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4749 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4750 return SrcStatus::IS_LO_NEG;
4751 }
4752 break;
4754 // Vector of 2:
4755 // Src = CurrUpper
4756 // Curr = [CurrUpper, CurrLower]
4757 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4758 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4759 // Src = -OpUpper
4760 //
4761 // Scalar:
4762 // Src = CurrUpper
4763 // Curr = [CurrUpper, CurrLower]
4764 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4765 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4766 // Src = -OpUpper
4769 if (NegType == TypeClass::VECTOR_OF_TWO) {
4770 // Vector of 2:
4771 // Src = CurrLower
4772 // Curr = [CurrUpper, CurrLower]
4773 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4774 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4775 // Src = -OpLower
4777 }
4778 if (NegType == TypeClass::SCALAR) {
4779 // Scalar:
4780 // Src = CurrLower
4781 // Curr = [CurrUpper, CurrLower]
4782 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4783 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4784 // Src = OpLower
4786 }
4787 break;
4789 // Vector of 2:
4790 // Src = -CurrUpper
4791 // Curr = [CurrUpper, CurrLower]
4792 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4793 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4794 // Src = -(-OpUpper) = OpUpper
4795 //
4796 // Scalar:
4797 // Src = -CurrUpper
4798 // Curr = [CurrUpper, CurrLower]
4799 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4800 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4801 // Src = -(-OpUpper) = OpUpper
4804 if (NegType == TypeClass::VECTOR_OF_TWO) {
4805 // Vector of 2:
4806 // Src = -CurrLower
4807 // Curr = [CurrUpper, CurrLower]
4808 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4809 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4810 // Src = -(-OpLower) = OpLower
4812 }
4813 if (NegType == TypeClass::SCALAR) {
4814 // Scalar:
4815 // Src = -CurrLower
4816 // Curr = [CurrUpper, CurrLower]
4817 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4818 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4819 // Src = -OpLower
4821 }
4822 break;
4823 default:
4824 break;
4825 }
4826 llvm_unreachable("unexpected SrcStatus & NegType combination");
4827}
4828
4829static std::optional<std::pair<Register, SrcStatus>>
4830calcNextStatus(std::pair<Register, SrcStatus> Curr,
4831 const MachineRegisterInfo &MRI) {
4832 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4833
4834 unsigned Opc = MI->getOpcode();
4835
4836 // Handle general Opc cases.
4837 switch (Opc) {
4838 case AMDGPU::G_BITCAST:
4839 return std::optional<std::pair<Register, SrcStatus>>(
4840 {MI->getOperand(1).getReg(), Curr.second});
4841 case AMDGPU::COPY:
4842 if (MI->getOperand(1).getReg().isPhysical())
4843 return std::nullopt;
4844 return std::optional<std::pair<Register, SrcStatus>>(
4845 {MI->getOperand(1).getReg(), Curr.second});
4846 case AMDGPU::G_FNEG: {
4847 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4848 if (Stat == SrcStatus::INVALID)
4849 return std::nullopt;
4850 return std::optional<std::pair<Register, SrcStatus>>(
4851 {MI->getOperand(1).getReg(), Stat});
4852 }
4853 default:
4854 break;
4855 }
4856
4857 // Calc next Stat from current Stat.
4858 switch (Curr.second) {
4859 case SrcStatus::IS_SAME:
4860 if (isTruncHalf(MI, MRI))
4861 return std::optional<std::pair<Register, SrcStatus>>(
4862 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4863 else if (isUnmergeHalf(MI, MRI)) {
4864 if (Curr.first == MI->getOperand(0).getReg())
4865 return std::optional<std::pair<Register, SrcStatus>>(
4866 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4867 return std::optional<std::pair<Register, SrcStatus>>(
4868 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4869 }
4870 break;
4872 if (isTruncHalf(MI, MRI)) {
4873 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4874 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4875 // = [OpLowerHi, OpLowerLo]
4876 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4877 // = [-OpLowerHi, OpLowerLo]
4878 // = -OpLower
4879 return std::optional<std::pair<Register, SrcStatus>>(
4880 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4881 }
4882 if (isUnmergeHalf(MI, MRI)) {
4883 if (Curr.first == MI->getOperand(0).getReg())
4884 return std::optional<std::pair<Register, SrcStatus>>(
4885 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4886 return std::optional<std::pair<Register, SrcStatus>>(
4887 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4888 }
4889 break;
4891 if (isShlHalf(MI, MRI))
4892 return std::optional<std::pair<Register, SrcStatus>>(
4893 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4894 break;
4896 if (isLshrHalf(MI, MRI))
4897 return std::optional<std::pair<Register, SrcStatus>>(
4898 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4899 break;
4901 if (isShlHalf(MI, MRI))
4902 return std::optional<std::pair<Register, SrcStatus>>(
4903 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4904 break;
4906 if (isLshrHalf(MI, MRI))
4907 return std::optional<std::pair<Register, SrcStatus>>(
4908 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4909 break;
4910 default:
4911 break;
4912 }
4913 return std::nullopt;
4914}
4915
4916/// This is used to control the valid statuses that the current MI supports. For
4917/// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4918/// support the NEG bit on VOP3P.
4919/// The class can be further extended to recognize support for the SEL, NEG, and
4920/// ABS bits for different MIs on different architectures.
4922private:
4923 bool HasNeg = false;
4924  // Assume all complex patterns of VOP3P have opsel.
4925 bool HasOpsel = true;
4926
4927public:
4929 const MachineInstr *MI = MRI.getVRegDef(Reg);
4930 unsigned Opc = MI->getOpcode();
4931
4932 if (Opc < TargetOpcode::GENERIC_OP_END) {
4933 // Keep same for generic op.
4934 HasNeg = true;
4935 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4936 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4937      // Only floating-point intrinsics have neg & neg_hi bits.
4938 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4939 HasNeg = true;
4940 }
4941 }
4942 bool checkOptions(SrcStatus Stat) const {
4943 if (!HasNeg &&
4944 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4945 return false;
4946 }
4947 if (!HasOpsel &&
4948 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4949 return false;
4950 }
4951 return true;
4952 }
4953};
4954
4957 int MaxDepth = 3) {
4958 int Depth = 0;
4959 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4961
4962 while (Depth <= MaxDepth && Curr.has_value()) {
4963 Depth++;
4964 if (SO.checkOptions(Curr.value().second))
4965 Statlist.push_back(Curr.value());
4966 Curr = calcNextStatus(Curr.value(), MRI);
4967 }
4968
4969 return Statlist;
4970}
4971
4972static std::pair<Register, SrcStatus>
4974 int MaxDepth = 3) {
4975 int Depth = 0;
4976 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4977 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4978
4979 while (Depth <= MaxDepth && Curr.has_value()) {
4980 Depth++;
4981 SrcStatus Stat = Curr.value().second;
4982 if (SO.checkOptions(Stat)) {
4983 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4985 LastSameOrNeg = Curr.value();
4986 }
4987 Curr = calcNextStatus(Curr.value(), MRI);
4988 }
4989
4990 return LastSameOrNeg;
4991}
4992
4993static bool isSameBitWidth(Register Reg1, Register Reg2,
4994 const MachineRegisterInfo &MRI) {
4995 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4996 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4997 return Width1 == Width2;
4998}
4999
5000static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5001  // SrcStatus::IS_LOWER_HALF leaves the modifier bits at 0.
5002 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5003 Mods ^= SISrcMods::NEG_HI;
5004 Mods |= SISrcMods::OP_SEL_1;
5005 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5006 Mods |= SISrcMods::OP_SEL_1;
5007 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5008 Mods ^= SISrcMods::NEG_HI;
5009 else if (HiStat == SrcStatus::IS_HI_NEG)
5010 Mods ^= SISrcMods::NEG_HI;
5011
5012 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5013 Mods ^= SISrcMods::NEG;
5014 Mods |= SISrcMods::OP_SEL_0;
5015 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5016 Mods |= SISrcMods::OP_SEL_0;
5017 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5018 Mods |= SISrcMods::NEG;
5019 else if (LoStat == SrcStatus::IS_HI_NEG)
5020 Mods ^= SISrcMods::NEG;
5021
5022 return Mods;
5023}
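
// A self-contained sketch of one composition updateMods() above can produce.
// The enum values below are placeholders for illustration, not the real
// SISrcMods encoding: OP_SEL_0/OP_SEL_1 select the upper half of the source
// for the lo/hi lane, NEG/NEG_HI negate the lo/hi lane.
namespace vop3p_mods_sketch {
enum : unsigned { NEG = 1u << 0, NEG_HI = 1u << 1, OP_SEL_0 = 1u << 2,
                  OP_SEL_1 = 1u << 3 };
// Both lanes read the upper half of the same source and the hi lane is also
// negated (HiStat == IS_UPPER_HALF_NEG, LoStat == IS_UPPER_HALF):
constexpr unsigned Example = OP_SEL_0 | OP_SEL_1 | NEG_HI;
} // namespace vop3p_mods_sketch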
5024
5025static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5026 Register RootReg, const SIInstrInfo &TII,
5027 const MachineRegisterInfo &MRI) {
5028 auto IsHalfState = [](SrcStatus S) {
5031 };
5032 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5033 IsHalfState(HiStat);
5034}
5035
5036std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5037 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5038 unsigned Mods = 0;
5039  // No modification if the Root type is not of the form <2 x Type>.
5040 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5041 Mods |= SISrcMods::OP_SEL_1;
5042 return {RootReg, Mods};
5043 }
5044
5045 SearchOptions SO(RootReg, MRI);
5046
5047 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5048
5049 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5051 else if (Stat.second == SrcStatus::IS_HI_NEG)
5052 Mods ^= SISrcMods::NEG_HI;
5053 else if (Stat.second == SrcStatus::IS_LO_NEG)
5054 Mods ^= SISrcMods::NEG;
5055
5056 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5057
5058 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5059 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5060 Mods |= SISrcMods::OP_SEL_1;
5061 return {Stat.first, Mods};
5062 }
5063
5065 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5066
5067 if (StatlistHi.empty()) {
5068 Mods |= SISrcMods::OP_SEL_1;
5069 return {Stat.first, Mods};
5070 }
5071
5073 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5074
5075 if (StatlistLo.empty()) {
5076 Mods |= SISrcMods::OP_SEL_1;
5077 return {Stat.first, Mods};
5078 }
5079
5080 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5081 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5082 if (StatlistHi[I].first == StatlistLo[J].first &&
5083 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5084 StatlistHi[I].first, RootReg, TII, MRI))
5085 return {StatlistHi[I].first,
5086 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5087 }
5088 }
5089 // Packed instructions do not have abs modifiers.
5090 Mods |= SISrcMods::OP_SEL_1;
5091
5092 return {Stat.first, Mods};
5093}
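
// Illustrative shape (pseudo-MIR, hedged) of a pattern the search above can
// fold:
//   %s:_(s32)       = G_LSHR %x:_(s32), %c16
//   %hi:_(s16)      = G_TRUNC %s
//   %lo:_(s16)      = G_TRUNC %x
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo, %hi
// Both halves of %v are just the two halves of %x, so %x is returned as the
// source operand (with only the default op_sel bits) instead of keeping the
// re-packed build_vector.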
5094
5095
5096
5097static bool checkRB(Register Reg, unsigned int RBNo,
5098 const AMDGPURegisterBankInfo &RBI,
5099 const MachineRegisterInfo &MRI,
5100 const TargetRegisterInfo &TRI) {
5101 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5102 return RB->getID() == RBNo;
5103}
5104
5105// This function is used to get the correct register bank for the returned reg.
5106// Assume:
5107// 1. VOP3P is always legal for VGPR.
5108// 2. RootOp's regbank is legal.
5109// Thus
5110// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5111// 2. If RootOp is VGPR, then NewOp must be VGPR.
5113 const AMDGPURegisterBankInfo &RBI,
5115 const TargetRegisterInfo &TRI,
5116 const SIInstrInfo &TII) {
5117  // RootOp can only be VGPR or SGPR (some hand-written cases such as
5118  // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5119 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5120 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5121 return NewReg;
5122
5123 MachineInstr *MI = MRI.getVRegDef(RootReg);
5124 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5125 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5126 return RootReg;
5127 }
5128
5129 MachineBasicBlock *BB = MI->getParent();
5130 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5131
5133 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5134 .addReg(NewReg);
5135
5136 // Only accept VGPR.
5137 return MIB->getOperand(0).getReg();
5138}
5139
5141AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5142 bool IsDOT) const {
5143 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5144 Register Reg;
5145 unsigned Mods;
5146 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5147
5148 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5149 return {{
5150 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5151 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5152 }};
5153}
5154
5156AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5157
5158 return selectVOP3PRetHelper(Root);
5159}
5160
5162AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5163
5164 return selectVOP3PRetHelper(Root, true);
5165}
5166
5168AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5169 MachineOperand &Root) const {
5170 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5171 "expected i1 value");
5172 unsigned Mods = SISrcMods::OP_SEL_1;
5173 if (Root.getImm() != 0)
5174 Mods |= SISrcMods::OP_SEL_0;
5175
5176 return {{
5177 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5178 }};
5179}
5180
5182 MachineInstr *InsertPt,
5184 const TargetRegisterClass *DstRegClass;
5185 switch (Elts.size()) {
5186 case 8:
5187 DstRegClass = &AMDGPU::VReg_256RegClass;
5188 break;
5189 case 4:
5190 DstRegClass = &AMDGPU::VReg_128RegClass;
5191 break;
5192 case 2:
5193 DstRegClass = &AMDGPU::VReg_64RegClass;
5194 break;
5195 default:
5196 llvm_unreachable("unhandled Reg sequence size");
5197 }
5198
5199 MachineIRBuilder B(*InsertPt);
5200 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5201 .addDef(MRI.createVirtualRegister(DstRegClass));
5202 for (unsigned i = 0; i < Elts.size(); ++i) {
5203 MIB.addReg(Elts[i]);
5205 }
5206 return MIB->getOperand(0).getReg();
5207}
5208
5209static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5211 MachineInstr *InsertPt,
5213 if (ModOpcode == TargetOpcode::G_FNEG) {
5214 Mods |= SISrcMods::NEG;
5215 // Check if all elements also have abs modifier
5216 SmallVector<Register, 8> NegAbsElts;
5217 for (auto El : Elts) {
5218 Register FabsSrc;
5219 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5220 break;
5221 NegAbsElts.push_back(FabsSrc);
5222 }
5223 if (Elts.size() != NegAbsElts.size()) {
5224 // Neg
5225 Src = buildRegSequence(Elts, InsertPt, MRI);
5226 } else {
5227 // Neg and Abs
5228 Mods |= SISrcMods::NEG_HI;
5229 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5230 }
5231 } else {
5232 assert(ModOpcode == TargetOpcode::G_FABS);
5233 // Abs
5234 Mods |= SISrcMods::NEG_HI;
5235 Src = buildRegSequence(Elts, InsertPt, MRI);
5236 }
5237}
5238
5240AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5241 Register Src = Root.getReg();
5242 unsigned Mods = SISrcMods::OP_SEL_1;
5244
5245 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5246 assert(BV->getNumSources() > 0);
5247    // Based on the first element, decide which mod we match, neg or abs
5248 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5249 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5250 ? AMDGPU::G_FNEG
5251 : AMDGPU::G_FABS;
5252 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5253 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5254 if (ElF32->getOpcode() != ModOpcode)
5255 break;
5256 EltsF32.push_back(ElF32->getOperand(1).getReg());
5257 }
5258
5259 // All elements had ModOpcode modifier
5260 if (BV->getNumSources() == EltsF32.size()) {
5261 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5262 *MRI);
5263 }
5264 }
5265
5266 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5267 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5268}
5269
5271AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5272 Register Src = Root.getReg();
5273 unsigned Mods = SISrcMods::OP_SEL_1;
5274 SmallVector<Register, 8> EltsV2F16;
5275
5276 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5277 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5278 Register FNegSrc;
5279 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5280 break;
5281 EltsV2F16.push_back(FNegSrc);
5282 }
5283
5284    // All elements had the fneg modifier
5285 if (CV->getNumSources() == EltsV2F16.size()) {
5286 Mods |= SISrcMods::NEG;
5287 Mods |= SISrcMods::NEG_HI;
5288 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5289 }
5290 }
5291
5292 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5293 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5294}
5295
5297AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5298 Register Src = Root.getReg();
5299 unsigned Mods = SISrcMods::OP_SEL_1;
5300 SmallVector<Register, 8> EltsV2F16;
5301
5302 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5303 assert(CV->getNumSources() > 0);
5304 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5305    // Based on the first element, decide which mod we match, neg or abs
5306 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5307 ? AMDGPU::G_FNEG
5308 : AMDGPU::G_FABS;
5309
5310 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5311 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5312 if (ElV2F16->getOpcode() != ModOpcode)
5313 break;
5314 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5315 }
5316
5317 // All elements had ModOpcode modifier
5318 if (CV->getNumSources() == EltsV2F16.size()) {
5319 MachineIRBuilder B(*Root.getParent());
5320 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5321 *MRI);
5322 }
5323 }
5324
5325 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5326 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5327}
5328
5330AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5331 std::optional<FPValueAndVReg> FPValReg;
5332 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5333 if (TII.isInlineConstant(FPValReg->Value)) {
5334 return {{[=](MachineInstrBuilder &MIB) {
5335 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5336 }}};
5337 }
5338    // Non-inlineable splat floats should not fall through for integer
5339    // immediate checks.
5340 return {};
5341 }
5342
5343 APInt ICst;
5344 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5345 if (TII.isInlineConstant(ICst)) {
5346 return {
5347 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5348 }
5349 }
5350
5351 return {};
5352}
5353
5355AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5356 Register Src =
5357 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5358 unsigned Key = 0;
5359
5360 Register ShiftSrc;
5361 std::optional<ValueAndVReg> ShiftAmt;
5362 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5363 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5364 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5365 Key = ShiftAmt->Value.getZExtValue() / 8;
5366 Src = ShiftSrc;
5367 }
5368
5369 return {{
5370 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5371 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5372 }};
5373}
5374
5376AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5377
5378 Register Src =
5379 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5380 unsigned Key = 0;
5381
5382 Register ShiftSrc;
5383 std::optional<ValueAndVReg> ShiftAmt;
5384 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5385 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5386 ShiftAmt->Value.getZExtValue() == 16) {
5387 Src = ShiftSrc;
5388 Key = 1;
5389 }
5390
5391 return {{
5392 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5393 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5394 }};
5395}
5396
5398AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5399 Register Src =
5400 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5401 unsigned Key = 0;
5402
5403 Register S32 = matchZeroExtendFromS32(Src);
5404 if (!S32)
5405 S32 = matchAnyExtendFromS32(Src);
5406
5407 if (S32) {
5408 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5409 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5410 assert(Def->getNumOperands() == 3);
5411 Register DstReg1 = Def->getOperand(1).getReg();
5412 if (mi_match(S32, *MRI,
5413 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5414 Src = Def->getOperand(2).getReg();
5415 Key = 1;
5416 }
5417 }
5418 }
5419
5420 return {{
5421 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5422 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5423 }};
5424}
5425
5427AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5428 Register Src;
5429 unsigned Mods;
5430 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5431
5432 // FIXME: Handle op_sel
5433 return {{
5434 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5435 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5436 }};
5437}
5438
5439// FIXME-TRUE16 remove when fake16 is removed
5441AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5442 Register Src;
5443 unsigned Mods;
5444 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5445 /*IsCanonicalizing=*/true,
5446 /*AllowAbs=*/false,
5447 /*OpSel=*/false);
5448
5449 return {{
5450 [=](MachineInstrBuilder &MIB) {
5451 MIB.addReg(
5452 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5453 },
5454 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5455 }};
5456}
5457
5459AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5460 Register Src;
5461 unsigned Mods;
5462 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5463 /*IsCanonicalizing=*/true,
5464 /*AllowAbs=*/false,
5465 /*OpSel=*/true);
5466
5467 return {{
5468 [=](MachineInstrBuilder &MIB) {
5469 MIB.addReg(
5470 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5471 },
5472 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5473 }};
5474}
5475
5476// Given \p Offset and the load specified by the \p Root operand, check if
5477// \p Offset is a multiple of the load byte size. If it is, update \p Offset to
5478// a pre-scaled value and return true.
5479bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5481 bool IsSigned) const {
5482 if (!Subtarget->hasScaleOffset())
5483 return false;
5484
5485 const MachineInstr &MI = *Root.getParent();
5486 MachineMemOperand *MMO = *MI.memoperands_begin();
5487
5488 if (!MMO->getSize().hasValue())
5489 return false;
5490
5491 uint64_t Size = MMO->getSize().getValue();
5492
5493 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5494 if (!OffsetReg)
5495 OffsetReg = Offset;
5496
5497 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5498 OffsetReg = Def->Reg;
5499
5500 Register Op0;
5501 MachineInstr *Mul;
5502 bool ScaleOffset =
5503 (isPowerOf2_64(Size) &&
5504 mi_match(OffsetReg, *MRI,
5505 m_GShl(m_Reg(Op0),
5508 mi_match(OffsetReg, *MRI,
5510 m_Copy(m_SpecificICst(Size))))) ||
5511 mi_match(
5512 OffsetReg, *MRI,
5513 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5514 m_Reg(Op0), m_SpecificICst(Size))) ||
5515 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5516 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5517 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5518 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5519 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5520 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5521 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5522 mi_match(Mul->getOperand(3).getReg(), *MRI,
5524 m_Copy(m_SpecificICst(Size))))) &&
5525 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5526
5527 if (ScaleOffset)
5528 Offset = Op0;
5529
5530 return ScaleOffset;
5531}
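
// A standalone sketch of the idea behind the matching above: when the byte
// offset is provably Index * Size (Size being the memory access size), the
// scaled addressing mode lets the hardware apply the multiply, so Index is
// selected as the offset operand and the callers set the SCAL cpol bit.
static inline bool matchesScaledOffsetSketch(uint64_t ByteOff, uint64_t Size,
                                             uint64_t &Index) {
  if (Size == 0 || ByteOff % Size != 0)
    return false;
  Index = ByteOff / Size; // e.g. ByteOff == 48, Size == 4 -> Index == 12
  return true;
}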
5532
5533bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5534 Register &Base,
5535 Register *SOffset,
5536 int64_t *Offset,
5537 bool *ScaleOffset) const {
5538 MachineInstr *MI = Root.getParent();
5539 MachineBasicBlock *MBB = MI->getParent();
5540
5541 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5542 // then we can select all ptr + 32-bit offsets.
5543 SmallVector<GEPInfo, 4> AddrInfo;
5544 getAddrModeInfo(*MI, *MRI, AddrInfo);
5545
5546 if (AddrInfo.empty())
5547 return false;
5548
5549 const GEPInfo &GEPI = AddrInfo[0];
5550 std::optional<int64_t> EncodedImm;
5551
5552 if (ScaleOffset)
5553 *ScaleOffset = false;
5554
5555 if (SOffset && Offset) {
5556 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5557 /*HasSOffset=*/true);
5558 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5559 AddrInfo.size() > 1) {
5560 const GEPInfo &GEPI2 = AddrInfo[1];
5561 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5562 Register OffsetReg = GEPI2.SgprParts[1];
5563 if (ScaleOffset)
5564 *ScaleOffset =
5565 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5566 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5567 if (OffsetReg) {
5568 Base = GEPI2.SgprParts[0];
5569 *SOffset = OffsetReg;
5570 *Offset = *EncodedImm;
5571 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5572 return true;
5573
5574          // For unbuffered smem loads, it is illegal for the Immediate Offset
5575          // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5576          // is negative. Handle the case where the Immediate Offset + SOffset
5577          // is negative.
5578 auto SKnown = VT->getKnownBits(*SOffset);
5579 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5580 return false;
5581
5582 return true;
5583 }
5584 }
5585 }
5586 return false;
5587 }
5588
5589 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5590 /*HasSOffset=*/false);
5591 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5592 Base = GEPI.SgprParts[0];
5593 *Offset = *EncodedImm;
5594 return true;
5595 }
5596
5597 // SGPR offset is unsigned.
5598 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5599 GEPI.Imm != 0) {
5600    // If we make it this far we have a load with a 32-bit immediate offset.
5601    // It is OK to select this using an SGPR offset, because we have already
5602    // failed trying to select this load into one of the _IMM variants since
5603    // the _IMM patterns are considered before the _SGPR patterns.
5604 Base = GEPI.SgprParts[0];
5605 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5606 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5607 .addImm(GEPI.Imm);
5608 return true;
5609 }
5610
5611 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5612 Register OffsetReg = GEPI.SgprParts[1];
5613 if (ScaleOffset)
5614 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5615 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5616 if (OffsetReg) {
5617 Base = GEPI.SgprParts[0];
5618 *SOffset = OffsetReg;
5619 return true;
5620 }
5621 }
5622
5623 return false;
5624}
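
// Hedged restatement of the negative-offset guard in the SOffset+Offset path
// above: a negative encoded immediate is only kept when immediate + the known
// minimum SOffset value stays non-negative.
static inline bool immPlusSOffsetNonNegativeSketch(int64_t EncodedImm,
                                                   int64_t SOffsetKnownMin) {
  // e.g. -16 with a known minimum SOffset of 8 sums to -8 and is rejected,
  // while -16 with a known minimum of 32 is accepted.
  return EncodedImm + SOffsetKnownMin >= 0;
}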
5625
5627AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5628 Register Base;
5629 int64_t Offset;
5630 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5631 /* ScaleOffset */ nullptr))
5632 return std::nullopt;
5633
5634 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5635 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5636}
5637
5639AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5640 SmallVector<GEPInfo, 4> AddrInfo;
5641 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5642
5643 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5644 return std::nullopt;
5645
5646 const GEPInfo &GEPInfo = AddrInfo[0];
5647 Register PtrReg = GEPInfo.SgprParts[0];
5648 std::optional<int64_t> EncodedImm =
5649 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5650 if (!EncodedImm)
5651 return std::nullopt;
5652
5653 return {{
5654 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5655 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5656 }};
5657}
5658
5660AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5661 Register Base, SOffset;
5662 bool ScaleOffset;
5663 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5664 &ScaleOffset))
5665 return std::nullopt;
5666
5667 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5668 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5669 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5670 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5671}
5672
5674AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5675 Register Base, SOffset;
5676 int64_t Offset;
5677 bool ScaleOffset;
5678 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5679 return std::nullopt;
5680
5681 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5682 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5683 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5684 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5685 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5686}
5687
5688std::pair<Register, int>
5689AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5690 uint64_t FlatVariant) const {
5691 MachineInstr *MI = Root.getParent();
5692
5693 auto Default = std::pair(Root.getReg(), 0);
5694
5695 if (!STI.hasFlatInstOffsets())
5696 return Default;
5697
5698 Register PtrBase;
5699 int64_t ConstOffset;
5700 bool IsInBounds;
5701 std::tie(PtrBase, ConstOffset, IsInBounds) =
5702 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5703
5704 // Adding the offset to the base address with an immediate in a FLAT
5705 // instruction must not change the memory aperture in which the address falls.
5706 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5707 // instructions.
5708 if (ConstOffset == 0 ||
5709 (FlatVariant == SIInstrFlags::FlatScratch &&
5710 !isFlatScratchBaseLegal(Root.getReg())) ||
5711 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5712 return Default;
5713
5714 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5715 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5716 return Default;
5717
5718 return std::pair(PtrBase, ConstOffset);
5719}
5720
5722AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5723 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5724
5725 return {{
5726 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5727 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5728 }};
5729}
5730
5732AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5733 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5734
5735 return {{
5736 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5737 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5738 }};
5739}
5740
5742AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5743 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5744
5745 return {{
5746 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5747 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5748 }};
5749}
5750
5751// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5753AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5754 unsigned CPolBits,
5755 bool NeedIOffset) const {
5756 Register Addr = Root.getReg();
5757 Register PtrBase;
5758 int64_t ConstOffset;
5759 int64_t ImmOffset = 0;
5760
5761 // Match the immediate offset first, which canonically is moved as low as
5762 // possible.
5763 std::tie(PtrBase, ConstOffset, std::ignore) =
5764 getPtrBaseWithConstantOffset(Addr, *MRI);
5765
5766 if (ConstOffset != 0) {
5767 if (NeedIOffset &&
5768 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5770 Addr = PtrBase;
5771 ImmOffset = ConstOffset;
5772 } else {
5773 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5774 if (isSGPR(PtrBaseDef->Reg)) {
5775 if (ConstOffset > 0) {
5776 // Offset is too large.
5777 //
5778 // saddr + large_offset -> saddr +
5779 // (voffset = large_offset & ~MaxOffset) +
5780 // (large_offset & MaxOffset);
5781 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5782 if (NeedIOffset) {
5783 std::tie(SplitImmOffset, RemainderOffset) =
5784 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5786 }
5787
5788 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5789 : isUInt<32>(RemainderOffset)) {
5790 MachineInstr *MI = Root.getParent();
5791 MachineBasicBlock *MBB = MI->getParent();
5792 Register HighBits =
5793 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5794
5795 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5796 HighBits)
5797 .addImm(RemainderOffset);
5798
5799 if (NeedIOffset)
5800 return {{
5801 [=](MachineInstrBuilder &MIB) {
5802 MIB.addReg(PtrBase);
5803 }, // saddr
5804 [=](MachineInstrBuilder &MIB) {
5805 MIB.addReg(HighBits);
5806 }, // voffset
5807 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5808 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5809 }};
5810 return {{
5811 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5812 [=](MachineInstrBuilder &MIB) {
5813 MIB.addReg(HighBits);
5814 }, // voffset
5815 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5816 }};
5817 }
5818 }
5819
5820        // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5821        // is 1 we would need to perform 1 or 2 extra moves for each half of
5822        // the constant, and it is better to do a scalar add and then issue a
5823        // single VALU instruction to materialize zero. Otherwise it takes fewer
5824        // instructions to perform VALU adds with immediates or inline literals.
5825 unsigned NumLiterals =
5826 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5827 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5828 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5829 return std::nullopt;
5830 }
5831 }
5832 }
5833
5834 // Match the variable offset.
5835 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5836 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5837 // Look through the SGPR->VGPR copy.
5838 Register SAddr =
5839 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5840
5841 if (isSGPR(SAddr)) {
5842 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5843
5844 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5845 // inserted later.
5846 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5847 Subtarget->hasSignedGVSOffset());
5848 if (Register VOffset = matchExtendFromS32OrS32(
5849 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5850 if (NeedIOffset)
5851 return {{[=](MachineInstrBuilder &MIB) { // saddr
5852 MIB.addReg(SAddr);
5853 },
5854 [=](MachineInstrBuilder &MIB) { // voffset
5855 MIB.addReg(VOffset);
5856 },
5857 [=](MachineInstrBuilder &MIB) { // offset
5858 MIB.addImm(ImmOffset);
5859 },
5860 [=](MachineInstrBuilder &MIB) { // cpol
5861 MIB.addImm(CPolBits |
5862 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5863 }}};
5864 return {{[=](MachineInstrBuilder &MIB) { // saddr
5865 MIB.addReg(SAddr);
5866 },
5867 [=](MachineInstrBuilder &MIB) { // voffset
5868 MIB.addReg(VOffset);
5869 },
5870 [=](MachineInstrBuilder &MIB) { // cpol
5871 MIB.addImm(CPolBits |
5872 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5873 }}};
5874 }
5875 }
5876 }
5877
5878 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5879 // drop this.
5880 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5881 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5882 return std::nullopt;
5883
5884 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5885 // moves required to copy a 64-bit SGPR to VGPR.
5886 MachineInstr *MI = Root.getParent();
5887 MachineBasicBlock *MBB = MI->getParent();
5888 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5889
5890 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5891 .addImm(0);
5892
5893 if (NeedIOffset)
5894 return {{
5895 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5896 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5897 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5898 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5899 }};
5900 return {{
5901 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5902 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5903 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5904 }};
5905}
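
// Standalone sketch of the saddr + large_offset split described above, using a
// purely illustrative MaxOffset of 4095 (the real bound depends on the target
// and flat variant): an offset of 0x12345 becomes voffset 0x12000 plus an
// encoded immediate of 0x345.
static inline void splitLargeOffsetSketch(int64_t Off, int64_t MaxOffset,
                                          int64_t &VOffsetPart,
                                          int64_t &ImmPart) {
  VOffsetPart = Off & ~MaxOffset;
  ImmPart = Off & MaxOffset;
}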
5906
5908AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5909 return selectGlobalSAddr(Root, 0);
5910}
5911
5913AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5914 const MachineInstr &I = *Root.getParent();
5915
5916 // We are assuming CPol is always the last operand of the intrinsic.
5917 auto PassedCPol =
5918 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5919 return selectGlobalSAddr(Root, PassedCPol);
5920}
5921
5923AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5924 const MachineInstr &I = *Root.getParent();
5925
5926  // We are assuming CPol is the second-to-last operand of the intrinsic.
5927 auto PassedCPol =
5928 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5929 return selectGlobalSAddr(Root, PassedCPol);
5930}
5931
5933AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5934 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5935}
5936
5938AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5939 MachineOperand &Root) const {
5940 const MachineInstr &I = *Root.getParent();
5941
5942 // We are assuming CPol is always the last operand of the intrinsic.
5943 auto PassedCPol =
5944 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5945 return selectGlobalSAddr(Root, PassedCPol, false);
5946}
5947
5949AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5950 MachineOperand &Root) const {
5951 const MachineInstr &I = *Root.getParent();
5952
5953  // We are assuming CPol is the second-to-last operand of the intrinsic.
5954 auto PassedCPol =
5955 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5956 return selectGlobalSAddr(Root, PassedCPol, false);
5957}
5958
5960AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5961 Register Addr = Root.getReg();
5962 Register PtrBase;
5963 int64_t ConstOffset;
5964 int64_t ImmOffset = 0;
5965
5966 // Match the immediate offset first, which canonically is moved as low as
5967 // possible.
5968 std::tie(PtrBase, ConstOffset, std::ignore) =
5969 getPtrBaseWithConstantOffset(Addr, *MRI);
5970
5971 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5972 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5974 Addr = PtrBase;
5975 ImmOffset = ConstOffset;
5976 }
5977
5978 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5979 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5980 int FI = AddrDef->MI->getOperand(1).getIndex();
5981 return {{
5982 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5983 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5984 }};
5985 }
5986
5987 Register SAddr = AddrDef->Reg;
5988
5989 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5990 Register LHS = AddrDef->MI->getOperand(1).getReg();
5991 Register RHS = AddrDef->MI->getOperand(2).getReg();
5992 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5993 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5994
5995 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5996 isSGPR(RHSDef->Reg)) {
5997 int FI = LHSDef->MI->getOperand(1).getIndex();
5998 MachineInstr &I = *Root.getParent();
5999 MachineBasicBlock *BB = I.getParent();
6000 const DebugLoc &DL = I.getDebugLoc();
6001 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6002
6003 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6004 .addFrameIndex(FI)
6005 .addReg(RHSDef->Reg)
6006 .setOperandDead(3); // Dead scc
6007 }
6008 }
6009
6010 if (!isSGPR(SAddr))
6011 return std::nullopt;
6012
6013 return {{
6014 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6015 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6016 }};
6017}
6018
6019// Check whether the flat scratch SVS swizzle bug affects this access.
6020bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6021 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6022 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6023 return false;
6024
6025 // The bug affects the swizzling of SVS accesses if there is any carry out
6026 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6027 // voffset to (soffset + inst_offset).
6028 auto VKnown = VT->getKnownBits(VAddr);
6029 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6030 KnownBits::makeConstant(APInt(32, ImmOffset)));
6031 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6032 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6033 return (VMax & 3) + (SMax & 3) >= 4;
6034}
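
// The check above restated on concrete numbers: a carry out of the two low
// bits is possible iff the maximum possible low-two-bit values sum to 4 or
// more, e.g. (VMax & 3) == 3 and (SMax & 3) == 2 gives 5 >= 4 (possible
// carry), whereas 1 + 2 == 3 cannot carry.
static inline bool mayCarryFromLowTwoBitsSketch(uint64_t VMax, uint64_t SMax) {
  return (VMax & 3) + (SMax & 3) >= 4;
}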
6035
6037AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6038 Register Addr = Root.getReg();
6039 Register PtrBase;
6040 int64_t ConstOffset;
6041 int64_t ImmOffset = 0;
6042
6043 // Match the immediate offset first, which canonically is moved as low as
6044 // possible.
6045 std::tie(PtrBase, ConstOffset, std::ignore) =
6046 getPtrBaseWithConstantOffset(Addr, *MRI);
6047
6048 Register OrigAddr = Addr;
6049 if (ConstOffset != 0 &&
6050 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6052 Addr = PtrBase;
6053 ImmOffset = ConstOffset;
6054 }
6055
6056 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6057 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6058 return std::nullopt;
6059
6060 Register RHS = AddrDef->MI->getOperand(2).getReg();
6061 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6062 return std::nullopt;
6063
6064 Register LHS = AddrDef->MI->getOperand(1).getReg();
6065 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6066
6067 if (OrigAddr != Addr) {
6068 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6069 return std::nullopt;
6070 } else {
6071 if (!isFlatScratchBaseLegalSV(OrigAddr))
6072 return std::nullopt;
6073 }
6074
6075 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6076 return std::nullopt;
6077
6078 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6080 : 0;
6081
6082 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6083 int FI = LHSDef->MI->getOperand(1).getIndex();
6084 return {{
6085 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6086 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6087 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6088 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6089 }};
6090 }
6091
6092 if (!isSGPR(LHS))
6093 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6094 LHS = Def->Reg;
6095
6096 if (!isSGPR(LHS))
6097 return std::nullopt;
6098
6099 return {{
6100 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6101 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6102 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6103 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6104 }};
6105}
6106
6108AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6109 MachineInstr *MI = Root.getParent();
6110 MachineBasicBlock *MBB = MI->getParent();
6111 MachineFunction *MF = MBB->getParent();
6112 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6113
6114 int64_t Offset = 0;
6115 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6116 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
6117 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6118
6119 // TODO: Should this be inside the render function? The iterator seems to
6120 // move.
6121 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6122 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6123 HighBits)
6124 .addImm(Offset & ~MaxOffset);
6125
6126 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6127 MIB.addReg(Info->getScratchRSrcReg());
6128 },
6129 [=](MachineInstrBuilder &MIB) { // vaddr
6130 MIB.addReg(HighBits);
6131 },
6132 [=](MachineInstrBuilder &MIB) { // soffset
6133 // Use constant zero for soffset and rely on eliminateFrameIndex
6134 // to choose the appropriate frame register if need be.
6135 MIB.addImm(0);
6136 },
6137 [=](MachineInstrBuilder &MIB) { // offset
6138 MIB.addImm(Offset & MaxOffset);
6139 }}};
6140 }
6141
6142 assert(Offset == 0 || Offset == -1);
6143
6144 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6145 // offsets.
6146 std::optional<int> FI;
6147 Register VAddr = Root.getReg();
6148
6149 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6150 Register PtrBase;
6151 int64_t ConstOffset;
6152 std::tie(PtrBase, ConstOffset, std::ignore) =
6153 getPtrBaseWithConstantOffset(VAddr, *MRI);
6154 if (ConstOffset != 0) {
6155 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6156 (!STI.privateMemoryResourceIsRangeChecked() ||
6157 VT->signBitIsZero(PtrBase))) {
6158 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6159 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6160 FI = PtrBaseDef->getOperand(1).getIndex();
6161 else
6162 VAddr = PtrBase;
6163 Offset = ConstOffset;
6164 }
6165 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6166 FI = RootDef->getOperand(1).getIndex();
6167 }
6168
6169 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6170 MIB.addReg(Info->getScratchRSrcReg());
6171 },
6172 [=](MachineInstrBuilder &MIB) { // vaddr
6173 if (FI)
6174 MIB.addFrameIndex(*FI);
6175 else
6176 MIB.addReg(VAddr);
6177 },
6178 [=](MachineInstrBuilder &MIB) { // soffset
6179 // Use constant zero for soffset and rely on eliminateFrameIndex
6180 // to choose the appropriate frame register if need be.
6181 MIB.addImm(0);
6182 },
6183 [=](MachineInstrBuilder &MIB) { // offset
6184 MIB.addImm(Offset);
6185 }}};
6186}
6187
6188bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6189 int64_t Offset) const {
6190 if (!isUInt<16>(Offset))
6191 return false;
6192
6193 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6194 return true;
6195
6196  // On Southern Islands, instructions with a negative base value and an offset
6197  // don't seem to work.
6198 return VT->signBitIsZero(Base);
6199}
6200
6201bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6202 int64_t Offset1,
6203 unsigned Size) const {
6204 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6205 return false;
6206 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6207 return false;
6208
6209 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6210 return true;
6211
6212  // On Southern Islands, instructions with a negative base value and an offset
6213  // don't seem to work.
6214 return VT->signBitIsZero(Base);
6215}
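
// Numeric example for the checks above: with Size == 4, each encoded offset is
// a 4-byte element index that must fit in 8 bits, so legal byte offsets are
// multiples of 4 no larger than 4 * 255 == 1020.
static_assert(1020 / 4 <= 255 && 1024 / 4 > 255,
              "illustrative bound for the 8-bit ds offset fields");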
6216
6217// Return whether the operation has the NoUnsignedWrap property.
6218static bool isNoUnsignedWrap(MachineInstr *Addr) {
6219 return Addr->getOpcode() == TargetOpcode::G_OR ||
6220 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6222}
6223
6224// Check that the base address of a flat scratch load/store in the form of
6225// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
6226// hardware requirement). We always treat the first operand as the base here.
6227bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6228 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6229
6230 if (isNoUnsignedWrap(AddrMI))
6231 return true;
6232
6233 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6234 // values.
6235 if (STI.hasSignedScratchOffsets())
6236 return true;
6237
6238 Register LHS = AddrMI->getOperand(1).getReg();
6239 Register RHS = AddrMI->getOperand(2).getReg();
6240
6241 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6242 std::optional<ValueAndVReg> RhsValReg =
6244    // If the immediate offset is negative and within a certain range, the base
6245    // address cannot also be negative. If the base is also negative, the sum
6246    // would be either negative or much larger than the valid range of scratch
6247    // memory a thread can access.
6248 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6249 RhsValReg->Value.getSExtValue() > -0x40000000)
6250 return true;
6251 }
6252
6253 return VT->signBitIsZero(LHS);
6254}
6255
6256// Check that the address values in SGPR/VGPR are legal for flat scratch in the
6257// form of: SGPR + VGPR.
6258bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6259 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6260
6261 if (isNoUnsignedWrap(AddrMI))
6262 return true;
6263
6264 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6265 // values.
6266 if (STI.hasSignedScratchOffsets())
6267 return true;
6268
6269 Register LHS = AddrMI->getOperand(1).getReg();
6270 Register RHS = AddrMI->getOperand(2).getReg();
6271 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6272}
6273
6274// Check that the address values in SGPR/VGPR are legal for flat scratch in the
6275// form of: SGPR + VGPR + Imm.
6276bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6277 Register Addr) const {
6278 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6279 // values.
6280 if (STI.hasSignedScratchOffsets())
6281 return true;
6282
6283 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6284 Register Base = AddrMI->getOperand(1).getReg();
6285 std::optional<DefinitionAndSourceRegister> BaseDef =
6287 std::optional<ValueAndVReg> RHSOffset =
6289 assert(RHSOffset);
6290
6291  // If the immediate offset is negative and within a certain range, the base
6292  // address cannot also be negative. If the base is also negative, the sum
6293  // would be either negative or much larger than the valid range of scratch
6294  // memory a thread can access.
6295 if (isNoUnsignedWrap(BaseDef->MI) &&
6296 (isNoUnsignedWrap(AddrMI) ||
6297 (RHSOffset->Value.getSExtValue() < 0 &&
6298 RHSOffset->Value.getSExtValue() > -0x40000000)))
6299 return true;
6300
6301 Register LHS = BaseDef->MI->getOperand(1).getReg();
6302 Register RHS = BaseDef->MI->getOperand(2).getReg();
6303 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6304}
6305
6306bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6307 unsigned ShAmtBits) const {
6308 assert(MI.getOpcode() == TargetOpcode::G_AND);
6309
6310 std::optional<APInt> RHS =
6311 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6312 if (!RHS)
6313 return false;
6314
6315 if (RHS->countr_one() >= ShAmtBits)
6316 return true;
6317
6318 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6319 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6320}
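
// Concrete instance of the check above: for a 32-bit shift only the low 5 bits
// of the amount are significant, so a `G_AND %amt, 31` mask (five trailing
// ones) is redundant, while `G_AND %amt, 15` (four trailing ones) is not,
// unless known-zero bits of %amt make up the difference.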
6321
6323AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6324 MachineOperand &Root) const {
6325 Register Reg = Root.getReg();
6326 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6327
6328 std::optional<DefinitionAndSourceRegister> Def =
6330 assert(Def && "this shouldn't be an optional result");
6331 Reg = Def->Reg;
6332
6333 if (Register WaveBase = getWaveAddress(Def->MI)) {
6334 return {{
6335 [=](MachineInstrBuilder &MIB) { // rsrc
6336 MIB.addReg(Info->getScratchRSrcReg());
6337 },
6338 [=](MachineInstrBuilder &MIB) { // soffset
6339 MIB.addReg(WaveBase);
6340 },
6341 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6342 }};
6343 }
6344
6345 int64_t Offset = 0;
6346
6347 // FIXME: Copy check is a hack
6349 if (mi_match(Reg, *MRI,
6350 m_GPtrAdd(m_Reg(BasePtr),
6352 if (!TII.isLegalMUBUFImmOffset(Offset))
6353 return {};
6354 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6355 Register WaveBase = getWaveAddress(BasePtrDef);
6356 if (!WaveBase)
6357 return {};
6358
6359 return {{
6360 [=](MachineInstrBuilder &MIB) { // rsrc
6361 MIB.addReg(Info->getScratchRSrcReg());
6362 },
6363 [=](MachineInstrBuilder &MIB) { // soffset
6364 MIB.addReg(WaveBase);
6365 },
6366 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6367 }};
6368 }
6369
6370 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6371 !TII.isLegalMUBUFImmOffset(Offset))
6372 return {};
6373
6374 return {{
6375 [=](MachineInstrBuilder &MIB) { // rsrc
6376 MIB.addReg(Info->getScratchRSrcReg());
6377 },
6378 [=](MachineInstrBuilder &MIB) { // soffset
6379 MIB.addImm(0);
6380 },
6381 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6382 }};
6383}
6384
6385std::pair<Register, unsigned>
6386AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6387 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6388 int64_t ConstAddr = 0;
6389
6390 Register PtrBase;
6391 int64_t Offset;
6392 std::tie(PtrBase, Offset, std::ignore) =
6393 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6394
6395 if (Offset) {
6396 if (isDSOffsetLegal(PtrBase, Offset)) {
6397 // (add n0, c0)
6398 return std::pair(PtrBase, Offset);
6399 }
6400 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6401 // TODO
6402
6403
6404 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6405 // TODO
6406
6407 }
6408
6409 return std::pair(Root.getReg(), 0);
6410}
6411
6413AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6414 Register Reg;
6415 unsigned Offset;
6416 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6417 return {{
6418 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6419 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6420 }};
6421}
6422
6424AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6425 return selectDSReadWrite2(Root, 4);
6426}
6427
6429AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6430 return selectDSReadWrite2(Root, 8);
6431}
6432
6434AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6435 unsigned Size) const {
6436 Register Reg;
6437 unsigned Offset;
6438 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6439 return {{
6440 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6441 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6442 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6443 }};
6444}
6445
6446std::pair<Register, unsigned>
6447AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6448 unsigned Size) const {
6449 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6450 int64_t ConstAddr = 0;
6451
6452 Register PtrBase;
6453 int64_t Offset;
6454 std::tie(PtrBase, Offset, std::ignore) =
6455 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6456
6457 if (Offset) {
6458 int64_t OffsetValue0 = Offset;
6459 int64_t OffsetValue1 = Offset + Size;
6460 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6461 // (add n0, c0)
6462 return std::pair(PtrBase, OffsetValue0 / Size);
6463 }
6464 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6465 // TODO
6466
6467 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6468 // TODO
6469
6470 }
6471
6472 return std::pair(Root.getReg(), 0);
6473}
6474
6475/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right-hand side, return
6476/// the base value, the constant offset, and whether the offset computation is
6477/// known to be inbounds. There may be intervening copies between \p Root and
6478/// the identified constant. Returns \p Root, 0, false if this does not match
6479/// the pattern.
6480std::tuple<Register, int64_t, bool>
6481AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6482 Register Root, const MachineRegisterInfo &MRI) const {
6483 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6484 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6485 return {Root, 0, false};
6486
6487 MachineOperand &RHS = RootI->getOperand(2);
6488 std::optional<ValueAndVReg> MaybeOffset =
6490 if (!MaybeOffset)
6491 return {Root, 0, false};
6492 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6493 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6494 IsInBounds};
6495}
6496
6498 MIB.addImm(0);
6499}
6500
6501/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6502/// BasePtr is not valid, a null base pointer will be used.
6504 uint32_t FormatLo, uint32_t FormatHi,
6505 Register BasePtr) {
6506 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6507 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6508 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6509 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6510
6511 B.buildInstr(AMDGPU::S_MOV_B32)
6512 .addDef(RSrc2)
6513 .addImm(FormatLo);
6514 B.buildInstr(AMDGPU::S_MOV_B32)
6515 .addDef(RSrc3)
6516 .addImm(FormatHi);
6517
6518 // Build the half of the subregister with the constants before building the
6519 // full 128-bit register. If we are building multiple resource descriptors,
6520 // this will allow CSEing of the 2-component register.
6521 B.buildInstr(AMDGPU::REG_SEQUENCE)
6522 .addDef(RSrcHi)
6523 .addReg(RSrc2)
6524 .addImm(AMDGPU::sub0)
6525 .addReg(RSrc3)
6526 .addImm(AMDGPU::sub1);
6527
6528 Register RSrcLo = BasePtr;
6529 if (!BasePtr) {
6530 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6531 B.buildInstr(AMDGPU::S_MOV_B64)
6532 .addDef(RSrcLo)
6533 .addImm(0);
6534 }
6535
6536 B.buildInstr(AMDGPU::REG_SEQUENCE)
6537 .addDef(RSrc)
6538 .addReg(RSrcLo)
6539 .addImm(AMDGPU::sub0_sub1)
6540 .addReg(RSrcHi)
6541 .addImm(AMDGPU::sub2_sub3);
6542
6543 return RSrc;
6544}
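
// Layout recap of the 128-bit descriptor assembled above, as four 32-bit
// words:
//   sub0_sub1: BasePtr (or an S_MOV_B64 of 0 when no base pointer is given)
//   sub2:      FormatLo
//   sub3:      FormatHi
// The two format words are combined into a 64-bit half first so that repeated
// descriptors can CSE that shared constant half.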
6545
6547 const SIInstrInfo &TII, Register BasePtr) {
6548 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6549
6550 // FIXME: Why are half the "default" bits ignored based on the addressing
6551 // mode?
6552 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6553}
6554
6556 const SIInstrInfo &TII, Register BasePtr) {
6557 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6558
6559 // FIXME: Why are half the "default" bits ignored based on the addressing
6560 // mode?
6561 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6562}
6563
6564AMDGPUInstructionSelector::MUBUFAddressData
6565AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6566 MUBUFAddressData Data;
6567 Data.N0 = Src;
6568
6569 Register PtrBase;
6570 int64_t Offset;
6571
6572 std::tie(PtrBase, Offset, std::ignore) =
6573 getPtrBaseWithConstantOffset(Src, *MRI);
6574 if (isUInt<32>(Offset)) {
6575 Data.N0 = PtrBase;
6576 Data.Offset = Offset;
6577 }
6578
6579 if (MachineInstr *InputAdd
6580 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6581 Data.N2 = InputAdd->getOperand(1).getReg();
6582 Data.N3 = InputAdd->getOperand(2).getReg();
6583
6584    // FIXME: Need to fix extra SGPR->VGPR copies inserted
6585    // FIXME: Don't know if this was defined by operand 0
6586 //
6587 // TODO: Remove this when we have copy folding optimizations after
6588 // RegBankSelect.
6589 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6590 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6591 }
6592
6593 return Data;
6594}
6595
6596/// Return whether the addr64 MUBUF mode should be used for the given address.
6597bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6598 // (ptr_add N2, N3) -> addr64, or
6599 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6600 if (Addr.N2)
6601 return true;
6602
6603 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6604 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6605}
6606
6607/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6608/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6609/// component.
6610void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6611 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6612 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6613 return;
6614
6615 // Illegal offset, store it in soffset.
6616 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6617 B.buildInstr(AMDGPU::S_MOV_B32)
6618 .addDef(SOffset)
6619 .addImm(ImmOffset);
6620 ImmOffset = 0;
6621}
6622
6623bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6624 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6625 Register &SOffset, int64_t &Offset) const {
6626 // FIXME: Predicates should stop this from reaching here.
6627 // addr64 bit was removed for volcanic islands.
6628 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6629 return false;
6630
6631 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6632 if (!shouldUseAddr64(AddrData))
6633 return false;
6634
6635 Register N0 = AddrData.N0;
6636 Register N2 = AddrData.N2;
6637 Register N3 = AddrData.N3;
6638 Offset = AddrData.Offset;
6639
6640 // Base pointer for the SRD.
6641 Register SRDPtr;
6642
6643 if (N2) {
6644 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6645 assert(N3);
6646 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6647 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6648 // addr64, and construct the default resource from a 0 address.
6649 VAddr = N0;
6650 } else {
6651 SRDPtr = N3;
6652 VAddr = N2;
6653 }
6654 } else {
6655 // N2 is not divergent.
6656 SRDPtr = N2;
6657 VAddr = N3;
6658 }
6659 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6660 // Use the default null pointer in the resource
6661 VAddr = N0;
6662 } else {
6663 // N0 -> offset, or
6664 // (N0 + C1) -> offset
6665 SRDPtr = N0;
6666 }
6667
6668 MachineIRBuilder B(*Root.getParent());
6669 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6670 splitIllegalMUBUFOffset(B, SOffset, Offset);
6671 return true;
6672}
6673
6674bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6675 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6676 int64_t &Offset) const {
6677
6678 // FIXME: Pattern should not reach here.
6679 if (STI.useFlatForGlobal())
6680 return false;
6681
6682 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6683 if (shouldUseAddr64(AddrData))
6684 return false;
6685
6686 // N0 -> offset, or
6687 // (N0 + C1) -> offset
6688 Register SRDPtr = AddrData.N0;
6689 Offset = AddrData.Offset;
6690
6691 // TODO: Look through extensions for 32-bit soffset.
6692 MachineIRBuilder B(*Root.getParent());
6693
6694 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6695 splitIllegalMUBUFOffset(B, SOffset, Offset);
6696 return true;
6697}
6698
6699AMDGPUInstructionSelector::ComplexRendererFns
6700AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6701 Register VAddr;
6702 Register RSrcReg;
6703 Register SOffset;
6704 int64_t Offset = 0;
6705
6706 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6707 return {};
6708
6709 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6710 // pattern.
6711 return {{
6712 [=](MachineInstrBuilder &MIB) { // rsrc
6713 MIB.addReg(RSrcReg);
6714 },
6715 [=](MachineInstrBuilder &MIB) { // vaddr
6716 MIB.addReg(VAddr);
6717 },
6718 [=](MachineInstrBuilder &MIB) { // soffset
6719 if (SOffset)
6720 MIB.addReg(SOffset);
6721 else if (STI.hasRestrictedSOffset())
6722 MIB.addReg(AMDGPU::SGPR_NULL);
6723 else
6724 MIB.addImm(0);
6725 },
6726 [=](MachineInstrBuilder &MIB) { // offset
6727 MIB.addImm(Offset);
6728 },
6729 addZeroImm, // cpol
6730 addZeroImm, // tfe
6731 addZeroImm // swz
6732 }};
6733}
6734
6735AMDGPUInstructionSelector::ComplexRendererFns
6736AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6737 Register RSrcReg;
6738 Register SOffset;
6739 int64_t Offset = 0;
6740
6741 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6742 return {};
6743
6744 return {{
6745 [=](MachineInstrBuilder &MIB) { // rsrc
6746 MIB.addReg(RSrcReg);
6747 },
6748 [=](MachineInstrBuilder &MIB) { // soffset
6749 if (SOffset)
6750 MIB.addReg(SOffset);
6751 else if (STI.hasRestrictedSOffset())
6752 MIB.addReg(AMDGPU::SGPR_NULL);
6753 else
6754 MIB.addImm(0);
6755 },
6756 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6757 addZeroImm, // cpol
6758 addZeroImm, // tfe
6759 addZeroImm, // swz
6760 }};
6761}
6762
6763AMDGPUInstructionSelector::ComplexRendererFns
6764AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6765
6766 Register SOffset = Root.getReg();
6767
6768 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6769 SOffset = AMDGPU::SGPR_NULL;
6770
6771 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6772}
6773
6774/// Get an immediate that must be 32-bits, and treated as zero extended.
6775static std::optional<uint64_t>
6776getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6777  // getIConstantVRegVal sexts any values, so see if that matters.
6778 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6779 if (!OffsetVal || !isInt<32>(*OffsetVal))
6780 return std::nullopt;
6781 return Lo_32(*OffsetVal);
6782}
6783
6784AMDGPUInstructionSelector::ComplexRendererFns
6785AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6786 std::optional<uint64_t> OffsetVal =
6787 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6788 if (!OffsetVal)
6789 return {};
6790
6791 std::optional<int64_t> EncodedImm =
6792 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6793 if (!EncodedImm)
6794 return {};
6795
6796 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6797}
6798
6799AMDGPUInstructionSelector::ComplexRendererFns
6800AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6801 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6802
6803 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6804 if (!OffsetVal)
6805 return {};
6806
6807 std::optional<int64_t> EncodedImm =
6808      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6809  if (!EncodedImm)
6810 return {};
6811
6812 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6813}
6814
6815AMDGPUInstructionSelector::ComplexRendererFns
6816AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6817 // Match the (soffset + offset) pair as a 32-bit register base and
6818 // an immediate offset.
6819 Register SOffset;
6820 unsigned Offset;
6821 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6822 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6823 if (!SOffset)
6824 return std::nullopt;
6825
6826 std::optional<int64_t> EncodedOffset =
6827 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6828 if (!EncodedOffset)
6829 return std::nullopt;
6830
6831 assert(MRI->getType(SOffset) == LLT::scalar(32));
6832 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6833 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6834}
6835
6836std::pair<Register, unsigned>
6837AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6838 bool &Matched) const {
6839 Matched = false;
6840
6841 Register Src;
6842 unsigned Mods;
6843 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6844
6845 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6846 assert(MRI->getType(Src) == LLT::scalar(16));
6847
6848    // Only change Src if a source modifier could be gained. In such cases the
6849    // new Src could be an SGPR, but this does not violate the constant bus
6850    // restriction for the instruction being selected.
6851 Src = stripBitCast(Src, *MRI);
6852
6853 const auto CheckAbsNeg = [&]() {
6854 // Be careful about folding modifiers if we already have an abs. fneg is
6855 // applied last, so we don't want to apply an earlier fneg.
6856 if ((Mods & SISrcMods::ABS) == 0) {
6857 unsigned ModsTmp;
6858 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6859
6860 if ((ModsTmp & SISrcMods::NEG) != 0)
6861 Mods ^= SISrcMods::NEG;
6862
6863 if ((ModsTmp & SISrcMods::ABS) != 0)
6864 Mods |= SISrcMods::ABS;
6865 }
6866 };
6867
6868 CheckAbsNeg();
6869
6870 // op_sel/op_sel_hi decide the source type and source.
6871    // If the source's op_sel_hi is set, it indicates a conversion from fp16.
6872    // If the source's op_sel is set, it picks the high half of the source
6873    // register.
6874
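    // For example, a source of the form (fpext (extract_hi %x)) ends up with
    // both OP_SEL_1 (fp16 conversion) and OP_SEL_0 (high half) set.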
6875 Mods |= SISrcMods::OP_SEL_1;
6876
6877 if (isExtractHiElt(*MRI, Src, Src)) {
6878 Mods |= SISrcMods::OP_SEL_0;
6879 CheckAbsNeg();
6880 }
6881
6882 Matched = true;
6883 }
6884
6885 return {Src, Mods};
6886}
6887
6888AMDGPUInstructionSelector::ComplexRendererFns
6889AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6890 MachineOperand &Root) const {
6891 Register Src;
6892 unsigned Mods;
6893 bool Matched;
6894 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6895 if (!Matched)
6896 return {};
6897
6898 return {{
6899 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6900 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6901 }};
6902}
6903
6904AMDGPUInstructionSelector::ComplexRendererFns
6905AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6906 Register Src;
6907 unsigned Mods;
6908 bool Matched;
6909 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6910
6911 return {{
6912 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6913 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6914 }};
6915}
6916
6917bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6918 MachineInstr &I, Intrinsic::ID IntrID) const {
6919 MachineBasicBlock *MBB = I.getParent();
6920 const DebugLoc &DL = I.getDebugLoc();
6921 Register CCReg = I.getOperand(0).getReg();
6922
6923 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
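  // (S_CMP_EQ_U32 0, 0 always compares equal, so SCC is set to 1.)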
6924 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6925
6926 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6927 .addImm(I.getOperand(2).getImm());
6928
6929 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6930
6931 I.eraseFromParent();
6932 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6933 *MRI);
6934}
6935
6936bool AMDGPUInstructionSelector::selectSGetBarrierState(
6937 MachineInstr &I, Intrinsic::ID IntrID) const {
6938 MachineBasicBlock *MBB = I.getParent();
6939 const DebugLoc &DL = I.getDebugLoc();
6940 const MachineOperand &BarOp = I.getOperand(2);
6941 std::optional<int64_t> BarValImm =
6942 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6943
6944 if (!BarValImm) {
6945 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6946 .addReg(BarOp.getReg());
6947 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6948 }
6949 MachineInstrBuilder MIB;
6950 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6951 : AMDGPU::S_GET_BARRIER_STATE_M0;
6952 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6953
6954 auto DstReg = I.getOperand(0).getReg();
6955 const TargetRegisterClass *DstRC =
6956 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6957 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6958 return false;
6959 MIB.addDef(DstReg);
6960 if (BarValImm) {
6961 MIB.addImm(*BarValImm);
6962 }
6963 I.eraseFromParent();
6964 return true;
6965}
6966
6967unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6968 if (HasInlineConst) {
6969 switch (IntrID) {
6970 default:
6971 llvm_unreachable("not a named barrier op");
6972 case Intrinsic::amdgcn_s_barrier_join:
6973 return AMDGPU::S_BARRIER_JOIN_IMM;
6974 case Intrinsic::amdgcn_s_wakeup_barrier:
6975 return AMDGPU::S_WAKEUP_BARRIER_IMM;
6976 case Intrinsic::amdgcn_s_get_named_barrier_state:
6977 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6978 };
6979 } else {
6980 switch (IntrID) {
6981 default:
6982 llvm_unreachable("not a named barrier op");
6983 case Intrinsic::amdgcn_s_barrier_join:
6984 return AMDGPU::S_BARRIER_JOIN_M0;
6985 case Intrinsic::amdgcn_s_wakeup_barrier:
6986 return AMDGPU::S_WAKEUP_BARRIER_M0;
6987 case Intrinsic::amdgcn_s_get_named_barrier_state:
6988 return AMDGPU::S_GET_BARRIER_STATE_M0;
6989 };
6990 }
6991}
6992
6993bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6994 MachineInstr &I, Intrinsic::ID IntrID) const {
6995 MachineBasicBlock *MBB = I.getParent();
6996 const DebugLoc &DL = I.getDebugLoc();
6997 const MachineOperand &BarOp = I.getOperand(1);
6998 const MachineOperand &CntOp = I.getOperand(2);
6999
7000 // BarID = (BarOp >> 4) & 0x3F
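  // (i.e. the barrier ID occupies bits [9:4] of the incoming operand)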
7001 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7002 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7003 .add(BarOp)
7004 .addImm(4u)
7005 .setOperandDead(3); // Dead scc
7006
7007 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7008 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7009 .addReg(TmpReg0)
7010 .addImm(0x3F)
7011 .setOperandDead(3); // Dead scc
7012
7013  // MO = ((CntOp & 0x3F) << ShAmt) | BarID
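  // (with ShAmt == 16: member count in M0[21:16], barrier ID in M0[5:0])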
7014 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7015 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7016 .add(CntOp)
7017 .addImm(0x3F)
7018 .setOperandDead(3); // Dead scc
7019
7020 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7021 constexpr unsigned ShAmt = 16;
7022 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7023 .addReg(TmpReg2)
7024 .addImm(ShAmt)
7025 .setOperandDead(3); // Dead scc
7026
7027 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7028 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7029 .addReg(TmpReg1)
7030 .addReg(TmpReg3)
7031 .setOperandDead(3); // Dead scc;
7032
7033 auto CopyMIB =
7034 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7035 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7036
7037 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7038 ? AMDGPU::S_BARRIER_INIT_M0
7039 : AMDGPU::S_BARRIER_SIGNAL_M0;
7040 MachineInstrBuilder MIB;
7041 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7042
7043 I.eraseFromParent();
7044 return true;
7045}
7046
7047bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7048 MachineInstr &I, Intrinsic::ID IntrID) const {
7049 MachineBasicBlock *MBB = I.getParent();
7050 const DebugLoc &DL = I.getDebugLoc();
7051 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7052 ? I.getOperand(2)
7053 : I.getOperand(1);
7054 std::optional<int64_t> BarValImm =
7055 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7056
7057 if (!BarValImm) {
7058 // BarID = (BarOp >> 4) & 0x3F
7059 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7060 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7061 .addReg(BarOp.getReg())
7062 .addImm(4u)
7063 .setOperandDead(3); // Dead scc;
7064
7065 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7066 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7067 .addReg(TmpReg0)
7068 .addImm(0x3F)
7069 .setOperandDead(3); // Dead scc;
7070
7071 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7072 .addReg(TmpReg1);
7073 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7074 }
7075
7076 MachineInstrBuilder MIB;
7077 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7078 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7079
7080 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7081 auto DstReg = I.getOperand(0).getReg();
7082 const TargetRegisterClass *DstRC =
7083 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7084 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7085 return false;
7086 MIB.addDef(DstReg);
7087 }
7088
7089 if (BarValImm) {
7090 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7091 MIB.addImm(BarId);
7092 }
7093
7094 I.eraseFromParent();
7095 return true;
7096}
7097
7098void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7099 const MachineInstr &MI,
7100 int OpIdx) const {
7101 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7102 "Expected G_CONSTANT");
7103 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7104}
7105
7106void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7107 const MachineInstr &MI,
7108 int OpIdx) const {
7109 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7110 "Expected G_CONSTANT");
7111 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7112}
7113
7114void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7115 const MachineInstr &MI,
7116 int OpIdx) const {
7117 const MachineOperand &Op = MI.getOperand(1);
7118 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7119 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7120}
7121
7122void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
7123 const MachineInstr &MI,
7124 int OpIdx) const {
7125 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7126 "Expected G_CONSTANT");
7127 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
7128}
7129
7130/// This only really exists to satisfy DAG type checking machinery, so is a
7131/// no-op here.
7132void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7133 const MachineInstr &MI,
7134 int OpIdx) const {
7135 const MachineOperand &Op = MI.getOperand(OpIdx);
7136 int64_t Imm;
7137 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7138 MIB.addImm(Imm);
7139 else
7140 MIB.addImm(Op.getImm());
7141}
7142
7143void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7144 const MachineInstr &MI,
7145 int OpIdx) const {
7146 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7147}
7148
7149void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7150 const MachineInstr &MI,
7151 int OpIdx) const {
7152 assert(OpIdx >= 0 && "expected to match an immediate operand");
7153 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7154}
7155
7156void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7157 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7158 assert(OpIdx >= 0 && "expected to match an immediate operand");
7159 MIB.addImm(
7160 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7161}
7162
7163void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7164 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7165 assert(OpIdx >= 0 && "expected to match an immediate operand");
7166 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7167                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7168                 : (int64_t)SISrcMods::DST_OP_SEL);
7169}
7170
7171void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7172 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7173 assert(OpIdx >= 0 && "expected to match an immediate operand");
7174 MIB.addImm(
7175 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7176}
7177
7178void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7179 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7180 assert(OpIdx >= 0 && "expected to match an immediate operand");
7181 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7182 ? (int64_t)(SISrcMods::OP_SEL_0)
7183 : 0);
7184}
7185
7186void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7187 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7188 assert(OpIdx >= 0 && "expected to match an immediate operand");
7189 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7190 : 0);
7191}
7192
7193void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7194 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7195 assert(OpIdx >= 0 && "expected to match an immediate operand");
7196 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7197 : 0);
7198}
7199
7200void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7201 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7202 assert(OpIdx >= 0 && "expected to match an immediate operand");
7203 MIB.addImm(
7204 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7205}
7206
7207void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7208 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7209 assert(OpIdx >= 0 && "expected to match an immediate operand");
7210 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7211 ? (int64_t)SISrcMods::DST_OP_SEL
7212 : 0);
7213}
7214
7215void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7216 const MachineInstr &MI,
7217 int OpIdx) const {
7218 assert(OpIdx >= 0 && "expected to match an immediate operand");
7219 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7220             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7221                                       : AMDGPU::CPol::ALL_pregfx12));
7222}
7223
7224void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7225 const MachineInstr &MI,
7226 int OpIdx) const {
7227 assert(OpIdx >= 0 && "expected to match an immediate operand");
7228 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7229                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7230                                                 : AMDGPU::CPol::SWZ_pregfx12);
7231  MIB.addImm(Swizzle);
7232}
7233
7234void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7235 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7236 assert(OpIdx >= 0 && "expected to match an immediate operand");
7237 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7238                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7239                                                  : AMDGPU::CPol::ALL_pregfx12);
7240  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7241}
7242
7243void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7244 const MachineInstr &MI,
7245 int OpIdx) const {
7246 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7247}
7248
7249void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7250 const MachineInstr &MI,
7251 int OpIdx) const {
7252 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7253 int ExpVal = APF.getExactLog2Abs();
7254 assert(ExpVal != INT_MIN);
7255 MIB.addImm(ExpVal);
7256}
7257
7258void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7259 const MachineInstr &MI,
7260 int OpIdx) const {
7261 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7262 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7263 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7264  // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
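  // i.e. the hardware value is (Imm + 3) % 4: 0 -> 3, 1 -> 0, 2 -> 1, 3 -> 2.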
7265 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7266}
7267
7268void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7269 const MachineInstr &MI,
7270 int OpIdx) const {
7271 unsigned Mods = SISrcMods::OP_SEL_1;
7272 if (MI.getOperand(OpIdx).getImm())
7273 Mods ^= SISrcMods::NEG;
7274 MIB.addImm((int64_t)Mods);
7275}
7276
7277void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7278 const MachineInstr &MI,
7279 int OpIdx) const {
7280 unsigned Mods = SISrcMods::OP_SEL_1;
7281 if (MI.getOperand(OpIdx).getImm())
7282    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7283  MIB.addImm((int64_t)Mods);
7284}
7285
7286void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7287 const MachineInstr &MI,
7288 int OpIdx) const {
7289 unsigned Val = MI.getOperand(OpIdx).getImm();
7290 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7291 if (Val == 1) // neg
7292 Mods ^= SISrcMods::NEG;
7293 if (Val == 2) // abs
7294 Mods ^= SISrcMods::ABS;
7295 if (Val == 3) // neg and abs
7296 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7297 MIB.addImm((int64_t)Mods);
7298}
7299
7300void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7301 const MachineInstr &MI,
7302 int OpIdx) const {
7303 uint32_t V = MI.getOperand(2).getImm();
7306 if (!Subtarget->hasSafeCUPrefetch())
7307 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7308 MIB.addImm(V);
7309}
7310
7311/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7312void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7313 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7314 unsigned Val = MI.getOperand(OpIdx).getImm();
7315 unsigned New = 0;
7316  if (Val & 0x1)
7317    New |= SISrcMods::OP_SEL_0;
7318  if (Val & 0x2)
7319    New |= SISrcMods::OP_SEL_1;
7320  MIB.addImm(New);
7321}
7322
7323bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7324 return TII.isInlineConstant(Imm);
7325}
7326
7327bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7328 return TII.isInlineConstant(Imm);
7329}