LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
// Returns the debug name of this selector ("amdgpu-isel", i.e. DEBUG_TYPE).
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
// Rewrite a copy-like intrinsic in place to \p NewOpc: drop the intrinsic ID
// operand, append an implicit EXEC use, and constrain source and destination
// to a single common register class. Returns false if that is not possible.
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  // After removing the intrinsic ID, operand 1 is the source value.
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  // Both operands must agree on one register class for a copy-like op.
  if (!DstRC || DstRC != SrcRC)
    return false;

  if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
    return false;
  // If the new opcode declares an early-clobber def, propagate that flag
  // onto the destination operand.
  const MCInstrDesc &MCID = MI.getDesc();
  if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
    MI.getOperand(0).setIsEarlyClobber(true);
  }
  return true;
}
128
// Select a generic COPY. The interesting case is a destination on the VCC
// bank, which may require materializing a lane mask (via mask-and-compare or
// a constant move); all other copies just get their operands constrained.
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // Non-VCC bool copied into VCC: must be expanded into a lane mask.
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        // Constant source: materialize an all-ones or all-zeros wave mask.
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          // 16-bit source: use the true16 AND/CMP forms (extra operands are
          // the source-modifier immediates, all zero here).
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          // Compare the masked low bit against zero to form the lane mask.
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  // Ordinary copy: constrain every virtual operand to its register class.
  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
246
247 Register DstReg = I.getOperand(0).getReg();
248 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
249
250 I.eraseFromParent();
251 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
252}
253
254bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
255 const DebugLoc &DL = I.getDebugLoc();
256 MachineBasicBlock *BB = I.getParent();
257
258 Register DstReg = I.getOperand(0).getReg();
259 Register SrcReg = I.getOperand(1).getReg();
260 std::optional<ValueAndVReg> Arg =
261 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
262
263 if (Arg) {
264 const int64_t Value = Arg->Value.getZExtValue();
265 if (Value == 0) {
266 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
267 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
268 } else {
269 assert(Value == 1);
270 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
271 }
272 I.eraseFromParent();
273 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
274 }
275
276 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
277 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
278
279 unsigned SelectOpcode =
280 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
281 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
282 .addReg(TRI.getExec())
283 .addImm(0);
284
285 I.eraseFromParent();
287 return true;
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302 return true;
303}
304
305bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
306 const Register DefReg = I.getOperand(0).getReg();
307 const LLT DefTy = MRI->getType(DefReg);
308
309 // S1 G_PHIs should not be selected in instruction-select, instead:
310 // - divergent S1 G_PHI should go through lane mask merging algorithm
311 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
312 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
313 if (DefTy == LLT::scalar(1))
314 return false;
315
316 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
317
318 const RegClassOrRegBank &RegClassOrBank =
319 MRI->getRegClassOrRegBank(DefReg);
320
321 const TargetRegisterClass *DefRC =
323 if (!DefRC) {
324 if (!DefTy.isValid()) {
325 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
326 return false;
327 }
328
329 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
330 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
331 if (!DefRC) {
332 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
333 return false;
334 }
335 }
336
337 // If inputs have register bank, assign corresponding reg class.
338 // Note: registers don't need to have the same reg bank.
339 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
340 const Register SrcReg = I.getOperand(i).getReg();
341
342 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
343 if (RB) {
344 const LLT SrcTy = MRI->getType(SrcReg);
345 const TargetRegisterClass *SrcRC =
346 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
347 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
348 return false;
349 }
350 }
351
352 I.setDesc(TII.get(TargetOpcode::PHI));
353 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
354}
355
357AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
358 const TargetRegisterClass &SubRC,
359 unsigned SubIdx) const {
360
361 MachineInstr *MI = MO.getParent();
362 MachineBasicBlock *BB = MO.getParent()->getParent();
363 Register DstReg = MRI->createVirtualRegister(&SubRC);
364
365 if (MO.isReg()) {
366 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
367 Register Reg = MO.getReg();
368 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
369 .addReg(Reg, {}, ComposedSubIdx);
370
371 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
372 MO.isKill(), MO.isDead(), MO.isUndef(),
373 MO.isEarlyClobber(), 0, MO.isDebug(),
374 MO.isInternalRead());
375 }
376
377 assert(MO.isImm());
378
379 APInt Imm(64, MO.getImm());
380
381 switch (SubIdx) {
382 default:
383 llvm_unreachable("do not know to split immediate with this sub index.");
384 case AMDGPU::sub0:
385 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
386 case AMDGPU::sub1:
387 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
388 }
389}
390
391static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
392 switch (Opc) {
393 case AMDGPU::G_AND:
394 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
395 case AMDGPU::G_OR:
396 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
397 case AMDGPU::G_XOR:
398 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
399 default:
400 llvm_unreachable("not a bit op");
401 }
402}
403
404bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
405 Register DstReg = I.getOperand(0).getReg();
406 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
407
408 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
409 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
410 DstRB->getID() != AMDGPU::VCCRegBankID)
411 return false;
412
413 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
414 STI.isWave64());
415 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
416
417 // Dead implicit-def of scc
418 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
419 true, // isImp
420 false, // isKill
421 true)); // isDead
422 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
423 return true;
424}
425
// Select scalar G_ADD/G_SUB. 32-bit cases map onto a single SALU or VALU
// instruction; 64-bit adds are split into a low add plus a high
// add-with-carry and recombined with REG_SEQUENCE.
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
      return true;
    }

    if (STI.hasAddNoCarryInsts()) {
      // Carry-less VALU form: append the clamp immediate (0) and an
      // implicit EXEC use, then select in place.
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      constrainSelectedInstRegOperands(I, TII, TRI, RBI);
      return true;
    }

    // Otherwise use the carry-out form with the carry def marked dead.
    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    return true;
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  // Split both 64-bit sources into their 32-bit halves.
  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    // Low add sets SCC; high add consumes it.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    // Low add produces a carry mask, consumed (and killed) by the high add.
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
  }

  // Recombine the two halves into the 64-bit destination.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}
530
// Select G_UADDO/G_USUBO/G_UADDE/G_USUBE. A VCC-bank carry-out selects the
// VALU carry instructions in place; otherwise the scalar add/sub family is
// used with the carry routed through SCC.
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();  // Result value.
  Register Dst1Reg = I.getOperand(1).getReg();  // Carry-out.
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    // VALU path: rewrite in place and append implicit EXEC plus clamp (0).
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    return true;
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    // Feed the incoming carry into SCC for S_ADDC/S_SUBB.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    // Copy the carry-out from SCC into the second result.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}
591
// Select G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 onto the V_MAD_*64_*32
// family, preferring the no-carry variants when the carry-out is unused and
// the subtarget has them.
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
                    MRI->use_nodbg_empty(I.getOperand(1).getReg());

  // NOTE(review): this assumes hasMADIntraFwdBug() and
  // hasMadU64U32NoCarry() never hold simultaneously — otherwise the carry
  // def would be removed from a carry-defining gfx11 opcode. TODO confirm.
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else if (UseNoCarry)
    Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
                     : AMDGPU::V_MAD_NC_I64_I32_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;

  if (UseNoCarry)
    I.removeOperand(1); // Drop the unused carry-out def.

  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  I.getOperand(0).setIsEarlyClobber(true);
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}
620
// TODO: We should probably legalize these to only using 32-bit results.
// Select G_EXTRACT of a 32-bit-aligned chunk (<= 128 bits) as a subregister
// copy from the source register.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  // The source class must actually support this subregister index.
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, {}, SubReg);

  I.eraseFromParent();
  return true;
}
666
// Select G_MERGE_VALUES of >= 32-bit pieces into a REG_SEQUENCE over the
// split subregister indices; smaller pieces go through TableGen patterns.
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  // One subregister index per source piece.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}
705
// Select G_UNMERGE_VALUES as one subregister COPY per destination,
// progressively narrowing the source class to one that supports each
// subregister index.
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  // The source is the last operand.
  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, {}, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}
750
// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Wide-element build_vectors
// are handled as merges; the remaining v2s16 cases fold constants, try
// TableGen patterns, then fall back to AND/shift-or (VALU) or S_PACK (SALU).
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      // Pack the two 16-bit constants into a single 32-bit immediate.
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef)  -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
      IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    // VALU fallback: mask the low half, then shift-or in the high half.
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
      .addImm(0xFFFF)
      .addReg(Src0);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
      .addReg(Src1)
      .addImm(16)
      .addReg(TmpReg);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
        .addReg(ShiftSrc0)
        .addImm(16)
        .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
      return true;
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}
899
900bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
901 const MachineOperand &MO = I.getOperand(0);
902
903 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
904 // regbank check here is to know why getConstrainedRegClassForOperand failed.
905 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
906 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
907 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
908 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
909 return true;
910 }
911
912 return false;
913}
914
// Select G_INSERT of a 32-bit-aligned chunk (<= 128 bits) as an
// INSERT_SUBREG at the matching subregister index.
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();   // Value being inserted into.
  Register Src1Reg = I.getOperand(2).getReg();   // Value being inserted.
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
    TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
    TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}
973
974bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
975 Register DstReg = MI.getOperand(0).getReg();
976 Register SrcReg = MI.getOperand(1).getReg();
977 Register OffsetReg = MI.getOperand(2).getReg();
978 Register WidthReg = MI.getOperand(3).getReg();
979
980 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
981 "scalar BFX instructions are expanded in regbankselect");
982 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
983 "64-bit vector BFX instructions are expanded in regbankselect");
984
985 const DebugLoc &DL = MI.getDebugLoc();
986 MachineBasicBlock *MBB = MI.getParent();
987
988 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
989 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
990 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
991 .addReg(SrcReg)
992 .addReg(OffsetReg)
993 .addReg(WidthReg);
994 MI.eraseFromParent();
995 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
996 return true;
997}
998
// Select llvm.amdgcn.interp.p1.f16 on subtargets with a 16-bank LDS, which
// requires a two-instruction expansion (V_INTERP_MOV then V_INTERP_P1LV).
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  // On 32-bank LDS the generated patterns handle this directly.
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}
1043
1044// Writelane is special in that it can use SGPR and M0 (which would normally
1045// count as using the constant bus twice - but in this case it is allowed since
1046// the lane selector doesn't count as a use of the constant bus). However, it is
1047// still required to abide by the 1 SGPR rule. Fix this up if we might have
1048// multiple SGPRs.
1049bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1050 // With a constant bus limit of at least 2, there's no issue.
1051 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1052 return selectImpl(MI, *CoverageInfo);
1053
1054 MachineBasicBlock *MBB = MI.getParent();
1055 const DebugLoc &DL = MI.getDebugLoc();
1056 Register VDst = MI.getOperand(0).getReg();
1057 Register Val = MI.getOperand(2).getReg();
1058 Register LaneSelect = MI.getOperand(3).getReg();
1059 Register VDstIn = MI.getOperand(4).getReg();
1060
1061 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1062
1063 std::optional<ValueAndVReg> ConstSelect =
1064 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1065 if (ConstSelect) {
1066 // The selector has to be an inline immediate, so we can use whatever for
1067 // the other operands.
1068 MIB.addReg(Val);
1069 MIB.addImm(ConstSelect->Value.getSExtValue() &
1070 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1071 } else {
1072 std::optional<ValueAndVReg> ConstVal =
1074
1075 // If the value written is an inline immediate, we can get away without a
1076 // copy to m0.
1077 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1078 STI.hasInv2PiInlineImm())) {
1079 MIB.addImm(ConstVal->Value.getSExtValue());
1080 MIB.addReg(LaneSelect);
1081 } else {
1082 MIB.addReg(Val);
1083
1084 // If the lane selector was originally in a VGPR and copied with
1085 // readfirstlane, there's a hazard to read the same SGPR from the
1086 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1087 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1088
1089 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1090 .addReg(LaneSelect);
1091 MIB.addReg(AMDGPU::M0);
1092 }
1093 }
1094
1095 MIB.addReg(VDstIn);
1096
1097 MI.eraseFromParent();
1098 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1099 return true;
1100}
1101
1102// We need to handle this here because tablegen doesn't support matching
1103// instructions with multiple outputs.
1104bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1105 Register Dst0 = MI.getOperand(0).getReg();
1106 Register Dst1 = MI.getOperand(1).getReg();
1107
1108 LLT Ty = MRI->getType(Dst0);
1109 unsigned Opc;
1110 if (Ty == LLT::scalar(32))
1111 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1112 else if (Ty == LLT::scalar(64))
1113 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1114 else
1115 return false;
1116
1117 // TODO: Match source modifiers.
1118
1119 const DebugLoc &DL = MI.getDebugLoc();
1120 MachineBasicBlock *MBB = MI.getParent();
1121
1122 Register Numer = MI.getOperand(3).getReg();
1123 Register Denom = MI.getOperand(4).getReg();
1124 unsigned ChooseDenom = MI.getOperand(5).getImm();
1125
1126 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1127
1128 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1129 .addDef(Dst1)
1130 .addImm(0) // $src0_modifiers
1131 .addUse(Src0) // $src0
1132 .addImm(0) // $src1_modifiers
1133 .addUse(Denom) // $src1
1134 .addImm(0) // $src2_modifiers
1135 .addUse(Numer) // $src2
1136 .addImm(0) // $clamp
1137 .addImm(0); // $omod
1138
1139 MI.eraseFromParent();
1140 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1141 return true;
1142}
1143
// Dispatch selection of no-side-effect G_INTRINSIC instructions that need
// manual handling; everything else falls through to the generated selector.
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
        .add(I.getOperand(0))
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    // All operands are lane masks; give them the wave-mask class.
    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    // Prefer the generated patterns; fall back to manual selection.
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  // All sparse-MFMA variants share one manual selection routine.
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  case Intrinsic::amdgcn_wave_shuffle:
    return selectWaveShuffleIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}
1234
1236 const GCNSubtarget &ST) {
1237 if (Size != 16 && Size != 32 && Size != 64)
1238 return -1;
1239
1240 if (Size == 16 && !ST.has16BitInsts())
1241 return -1;
1242
1243 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1244 unsigned FakeS16Opc, unsigned S32Opc,
1245 unsigned S64Opc) {
1246 if (Size == 16)
1247 return ST.hasTrue16BitInsts()
1248 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1249 : S16Opc;
1250 if (Size == 32)
1251 return S32Opc;
1252 return S64Opc;
1253 };
1254
1255 switch (P) {
1256 default:
1257 llvm_unreachable("Unknown condition code!");
1258 case CmpInst::ICMP_NE:
1259 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1260 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1261 AMDGPU::V_CMP_NE_U64_e64);
1262 case CmpInst::ICMP_EQ:
1263 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1264 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1265 AMDGPU::V_CMP_EQ_U64_e64);
1266 case CmpInst::ICMP_SGT:
1267 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1268 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1269 AMDGPU::V_CMP_GT_I64_e64);
1270 case CmpInst::ICMP_SGE:
1271 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1272 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1273 AMDGPU::V_CMP_GE_I64_e64);
1274 case CmpInst::ICMP_SLT:
1275 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1276 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1277 AMDGPU::V_CMP_LT_I64_e64);
1278 case CmpInst::ICMP_SLE:
1279 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1280 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1281 AMDGPU::V_CMP_LE_I64_e64);
1282 case CmpInst::ICMP_UGT:
1283 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1284 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1285 AMDGPU::V_CMP_GT_U64_e64);
1286 case CmpInst::ICMP_UGE:
1287 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1288 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1289 AMDGPU::V_CMP_GE_U64_e64);
1290 case CmpInst::ICMP_ULT:
1291 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1292 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1293 AMDGPU::V_CMP_LT_U64_e64);
1294 case CmpInst::ICMP_ULE:
1295 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1296 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1297 AMDGPU::V_CMP_LE_U64_e64);
1298
1299 case CmpInst::FCMP_OEQ:
1300 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1301 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1302 AMDGPU::V_CMP_EQ_F64_e64);
1303 case CmpInst::FCMP_OGT:
1304 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1305 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1306 AMDGPU::V_CMP_GT_F64_e64);
1307 case CmpInst::FCMP_OGE:
1308 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1309 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1310 AMDGPU::V_CMP_GE_F64_e64);
1311 case CmpInst::FCMP_OLT:
1312 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1313 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1314 AMDGPU::V_CMP_LT_F64_e64);
1315 case CmpInst::FCMP_OLE:
1316 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1317 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1318 AMDGPU::V_CMP_LE_F64_e64);
1319 case CmpInst::FCMP_ONE:
1320 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1321 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1322 AMDGPU::V_CMP_NEQ_F64_e64);
1323 case CmpInst::FCMP_ORD:
1324 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1325 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1326 AMDGPU::V_CMP_O_F64_e64);
1327 case CmpInst::FCMP_UNO:
1328 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1329 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1330 AMDGPU::V_CMP_U_F64_e64);
1331 case CmpInst::FCMP_UEQ:
1332 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1333 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1334 AMDGPU::V_CMP_NLG_F64_e64);
1335 case CmpInst::FCMP_UGT:
1336 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1337 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1338 AMDGPU::V_CMP_NLE_F64_e64);
1339 case CmpInst::FCMP_UGE:
1340 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1341 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1342 AMDGPU::V_CMP_NLT_F64_e64);
1343 case CmpInst::FCMP_ULT:
1344 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1345 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1346 AMDGPU::V_CMP_NGE_F64_e64);
1347 case CmpInst::FCMP_ULE:
1348 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1349 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1350 AMDGPU::V_CMP_NGT_F64_e64);
1351 case CmpInst::FCMP_UNE:
1352 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1353 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1354 AMDGPU::V_CMP_NEQ_F64_e64);
1355 case CmpInst::FCMP_TRUE:
1356 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1357 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1358 AMDGPU::V_CMP_TRU_F64_e64);
1360 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1361 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1362 AMDGPU::V_CMP_F_F64_e64);
1363 }
1364}
1365
// Map a predicate and operand size to the matching scalar S_CMP opcode, or
// return -1 when no SALU compare exists (caller then falls back to VALU).
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    // Only 64-bit equality compares exist in the SALU, and only on some
    // subtargets.
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    // 16-bit compares are FP-only and require SALU float support.
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}
1477
// Select G_ICMP/G_FCMP: use a scalar S_CMP + SCC copy when the result is not
// a VCC lane mask, otherwise a VALU V_CMP producing the lane mask directly.
bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    // Scalar path: compare writes SCC, which is then copied into the result.
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
    bool Ret =
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  // FCMP producing a VCC result is not handled here.
  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder ICmp;
  // t16 instructions carry source-modifier and op_sel operands.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .addImm(0)
               .add(I.getOperand(2))
               .addImm(0)
               .add(I.getOperand(3))
               .addImm(0); // op_sel
  } else {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .add(I.getOperand(2))
               .add(I.getOperand(3));
  }

  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
1533
// Manual fallback selection for llvm.amdgcn.icmp / llvm.amdgcn.fcmp, which
// return a full wave-sized mask rather than a per-lane i1.
bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  // The result width must match the wave size exactly.
  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    // An out-of-range predicate yields an undefined result.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  // Fold source modifiers and force the sources into VGPRs.
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  // Add the optional named operands only when the opcode defines them.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}
1590
1591// Ballot has to zero bits in input lane-mask that are zero in current exec,
1592// Done as AND with exec. For inputs that are results of instruction that
1593// implicitly use same exec, for example compares in same basic block or SCC to
1594// VCC copy, use copy.
1597 MachineInstr *MI = MRI.getVRegDef(Reg);
1598 if (MI->getParent() != MBB)
1599 return false;
1600
1601 // Lane mask generated by SCC to VCC copy.
1602 if (MI->getOpcode() == AMDGPU::COPY) {
1603 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1604 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1605 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1606 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1607 return true;
1608 }
1609
1610 // Lane mask generated using compare with same exec.
1611 if (isa<GAnyCmp>(MI))
1612 return true;
1613
1614 Register LHS, RHS;
1615 // Look through AND.
1616 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1617 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1619
1620 return false;
1621}
1622
1623bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1624 MachineBasicBlock *BB = I.getParent();
1625 const DebugLoc &DL = I.getDebugLoc();
1626 Register DstReg = I.getOperand(0).getReg();
1627 Register SrcReg = I.getOperand(2).getReg();
1628 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1629 const unsigned WaveSize = STI.getWavefrontSize();
1630
1631 // In the common case, the return type matches the wave size.
1632 // However we also support emitting i64 ballots in wave32 mode.
1633 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1634 return false;
1635
1636 std::optional<ValueAndVReg> Arg =
1638
1639 Register Dst = DstReg;
1640 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1641 if (BallotSize != WaveSize) {
1642 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1643 }
1644
1645 if (Arg) {
1646 const int64_t Value = Arg->Value.getZExtValue();
1647 if (Value == 0) {
1648 // Dst = S_MOV 0
1649 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1650 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1651 } else {
1652 // Dst = COPY EXEC
1653 assert(Value == 1);
1654 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1655 }
1656 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1657 return false;
1658 } else {
1659 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1660 // Dst = COPY SrcReg
1661 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1662 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1663 return false;
1664 } else {
1665 // Dst = S_AND SrcReg, EXEC
1666 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1667 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1668 .addReg(SrcReg)
1669 .addReg(TRI.getExec())
1670 .setOperandDead(3); // Dead scc
1671 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1672 }
1673 }
1674
1675 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1676 if (BallotSize != WaveSize) {
1677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1678 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1679 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1680 .addReg(Dst)
1681 .addImm(AMDGPU::sub0)
1682 .addReg(HiReg)
1683 .addImm(AMDGPU::sub1);
1684 }
1685
1686 I.eraseFromParent();
1687 return true;
1688}
1689
1690bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1691 Register DstReg = I.getOperand(0).getReg();
1692 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1693 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1694 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1695 return false;
1696
1697 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1698
1699 Module *M = MF->getFunction().getParent();
1700 const MDNode *Metadata = I.getOperand(2).getMetadata();
1701 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1702 auto *RelocSymbol = cast<GlobalVariable>(
1703 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1704
1705 MachineBasicBlock *BB = I.getParent();
1706 BuildMI(*BB, &I, I.getDebugLoc(),
1707 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1709
1710 I.eraseFromParent();
1711 return true;
1712}
1713
1714bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1715 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1716
1717 Register DstReg = I.getOperand(0).getReg();
1718 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1719 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1720 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1721
1722 MachineBasicBlock *MBB = I.getParent();
1723 const DebugLoc &DL = I.getDebugLoc();
1724
1725 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1726
1727 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1728 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1729 MIB.addImm(MFI->getLDSSize());
1730 } else {
1731 Module *M = MF->getFunction().getParent();
1732 const GlobalValue *GV =
1733 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1735 }
1736
1737 I.eraseFromParent();
1738 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1739 return true;
1740}
1741
// Select llvm.returnaddress: zero for nonzero depths and entry functions,
// otherwise a copy from the live-in return address register pair.
bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  // NOTE(review): RC is dereferenced without a null check; presumably
  // getConstrainedRegClassForOperand cannot fail for this operand — confirm.
  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass, DL);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}
1779
1780bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1781 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1782 // SelectionDAG uses for wave32 vs wave64.
1783 MachineBasicBlock *BB = MI.getParent();
1784 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1785 .add(MI.getOperand(1));
1786
1787 Register Reg = MI.getOperand(1).getReg();
1788 MI.eraseFromParent();
1789
1790 if (!MRI->getRegClassOrNull(Reg))
1791 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1792 return true;
1793}
1794
// Select llvm.amdgcn.ds.ordered.add / llvm.amdgcn.ds.ordered.swap into
// DS_ORDERED_COUNT. The index/wave_release/wave_done immediates are packed
// into the instruction's offset field, and M0 is loaded from operand 2.
1795bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1796    MachineInstr &MI, Intrinsic::ID IntrID) const {
1797  MachineBasicBlock *MBB = MI.getParent();
1798  MachineFunction *MF = MBB->getParent();
1799  const DebugLoc &DL = MI.getDebugLoc();
1800
1801  unsigned IndexOperand = MI.getOperand(7).getImm();
1802  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1803  bool WaveDone = MI.getOperand(9).getImm() != 0;
1804
1805  if (WaveDone && !WaveRelease) {
1806    // TODO: Move this to IR verifier
1807    const Function &Fn = MF->getFunction();
1808    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1809        Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1810  }
1811
  // The low 6 bits of the index operand select the ordered-count slot; any
  // remaining bits must be the GFX10+ dword count or zero.
1812  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1813  IndexOperand &= ~0x3f;
1814  unsigned CountDw = 0;
1815
1816  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1817    CountDw = (IndexOperand >> 24) & 0xf;
1818    IndexOperand &= ~(0xf << 24);
1819
1820    if (CountDw < 1 || CountDw > 4) {
1821      const Function &Fn = MF->getFunction();
1822      Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1823          Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
      // Recover with a legal value so selection can continue after the
      // diagnostic.
1824      CountDw = 1;
1825    }
1826  }
1827
  // Anything left over in the index operand is invalid.
1828  if (IndexOperand) {
1829    const Function &Fn = MF->getFunction();
1830    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1831        Fn, "ds_ordered_count: bad index operand", DL));
1832  }
1833
1834  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1835  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1836
  // Pack the ordered-count control fields into the 16-bit offset: offset0
  // carries the slot index * 4, offset1 carries the control bits.
1837  unsigned Offset0 = OrderedCountIndex << 2;
1838  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1839
1840  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1841    Offset1 |= (CountDw - 1) << 6;
1842
1843  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1844    Offset1 |= ShaderType << 2;
1845
1846  unsigned Offset = Offset0 | (Offset1 << 8);
1847
  // M0 holds the base address for the ordered-count operation.
1848  Register M0Val = MI.getOperand(2).getReg();
1849  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1850      .addReg(M0Val);
1851
1852  Register DstReg = MI.getOperand(0).getReg();
1853  Register ValReg = MI.getOperand(3).getReg();
1854  MachineInstrBuilder DS =
1855      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1856          .addReg(ValReg)
1857          .addImm(Offset)
1858          .cloneMemRefs(MI);
1859
1860  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1861    return false;
1862
1863  constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1864  MI.eraseFromParent();
1865  return true;
1866}
1867
1868static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1869 switch (IntrID) {
1870 case Intrinsic::amdgcn_ds_gws_init:
1871 return AMDGPU::DS_GWS_INIT;
1872 case Intrinsic::amdgcn_ds_gws_barrier:
1873 return AMDGPU::DS_GWS_BARRIER;
1874 case Intrinsic::amdgcn_ds_gws_sema_v:
1875 return AMDGPU::DS_GWS_SEMA_V;
1876 case Intrinsic::amdgcn_ds_gws_sema_br:
1877 return AMDGPU::DS_GWS_SEMA_BR;
1878 case Intrinsic::amdgcn_ds_gws_sema_p:
1879 return AMDGPU::DS_GWS_SEMA_P;
1880 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1881 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1882 default:
1883 llvm_unreachable("not a gws intrinsic");
1884 }
1885}
1886
// Select the ds_gws_* family of intrinsics. The variable part of the offset
// is shifted into M0[21:16]; any constant part folds into the instruction's
// immediate offset field.
1887bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1888 Intrinsic::ID IID) const {
1889  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1890                        !STI.hasGWSSemaReleaseAll()))
1891    return false;
1892
1893  // intrinsic ID, vsrc, offset
1894  const bool HasVSrc = MI.getNumOperands() == 3;
1895  assert(HasVSrc || MI.getNumOperands() == 2);
1896
1897  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1898  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1899  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1900    return false;
1901
1902  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1903  unsigned ImmOffset;
1904
1905  MachineBasicBlock *MBB = MI.getParent();
1906  const DebugLoc &DL = MI.getDebugLoc();
1907
1908  MachineInstr *Readfirstlane = nullptr;
1909
1910  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1911  // incoming offset, in case there's an add of a constant. We'll have to put it
1912  // back later.
1913  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1914    Readfirstlane = OffsetDef;
1915    BaseOffset = OffsetDef->getOperand(1).getReg();
1916    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1917  }
1918
1919  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1920    // If we have a constant offset, try to use the 0 in m0 as the base.
1921    // TODO: Look into changing the default m0 initialization value. If the
1922    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1923    // the immediate offset.
1924
1925    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1926    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1927        .addImm(0);
1928  } else {
    // Split off any constant addend so it can go in the immediate field.
    // NOTE(review): VT appears to be the selector's value-tracking member —
    // confirm against the class definition.
1929    std::tie(BaseOffset, ImmOffset) =
1930        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1931
1932    if (Readfirstlane) {
1933      // We have the constant offset now, so put the readfirstlane back on the
1934      // variable component.
1935      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1936        return false;
1937
1938      Readfirstlane->getOperand(1).setReg(BaseOffset);
1939      BaseOffset = Readfirstlane->getOperand(0).getReg();
1940    } else {
1941      if (!RBI.constrainGenericRegister(BaseOffset,
1942                                        AMDGPU::SReg_32RegClass, *MRI))
1943        return false;
1944    }
1945
    // The hardware reads the resource id from M0[21:16], so shift the
    // variable offset into place.
1946    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1947    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1948        .addReg(BaseOffset)
1949        .addImm(16)
1950        .setOperandDead(3); // Dead scc
1951
1952    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1953        .addReg(M0Base);
1954  }
1955
1956  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1957  // offset field) % 64. Some versions of the programming guide omit the m0
1958  // part, or claim it's from offset 0.
1959
1960  unsigned Opc = gwsIntrinToOpcode(IID);
1961  const MCInstrDesc &InstrDesc = TII.get(Opc);
1962
1963  if (HasVSrc) {
1964    Register VSrc = MI.getOperand(1).getReg();
1965
1966    int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1967    const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1968    const TargetRegisterClass *SubRC =
1969        TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1970
1971    if (!SubRC) {
1972      // 32-bit normal case.
1973      if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1974        return false;
1975
1976      BuildMI(*MBB, &MI, DL, InstrDesc)
1977          .addReg(VSrc)
1978          .addImm(ImmOffset)
1979          .cloneMemRefs(MI);
1980    } else {
1981      // Requires even register alignment, so create 64-bit value and pad the
1982      // top half with undef.
1983      Register DataReg = MRI->createVirtualRegister(DataRC);
1984      if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1985        return false;
1986
1987      Register UndefReg = MRI->createVirtualRegister(SubRC);
1988      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1989      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1990          .addReg(VSrc)
1991          .addImm(AMDGPU::sub0)
1992          .addReg(UndefReg)
1993          .addImm(AMDGPU::sub1);
1994
1995      BuildMI(*MBB, &MI, DL, InstrDesc)
1996          .addReg(DataReg)
1997          .addImm(ImmOffset)
1998          .cloneMemRefs(MI);
1999    }
2000  } else {
2001    BuildMI(*MBB, &MI, DL, InstrDesc)
2002        .addImm(ImmOffset)
2003        .cloneMemRefs(MI);
2004  }
2005
2006  MI.eraseFromParent();
2007  return true;
2008}
2009
// Select llvm.amdgcn.ds.append / llvm.amdgcn.ds.consume into DS_APPEND /
// DS_CONSUME. The pointer base goes in M0; a legal constant offset folds into
// the instruction's offset field.
2010bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2011 bool IsAppend) const {
2012  Register PtrBase = MI.getOperand(2).getReg();
2013  LLT PtrTy = MRI->getType(PtrBase);
  // Region (GDS) address space selects the gds form of the instruction.
2014  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2015
2016  unsigned Offset;
2017  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2018
2019  // TODO: Should this try to look through readfirstlane like GWS?
  // If the split-off offset is not encodable, fall back to the raw pointer
  // with a zero offset.
2020  if (!isDSOffsetLegal(PtrBase, Offset)) {
2021    PtrBase = MI.getOperand(2).getReg();
2022    Offset = 0;
2023  }
2024
2025  MachineBasicBlock *MBB = MI.getParent();
2026  const DebugLoc &DL = MI.getDebugLoc();
2027  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2028
2029  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2030      .addReg(PtrBase);
2031  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2032    return false;
2033
2034  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2035                 .addImm(Offset)
2036                 .addImm(IsGDS ? -1 : 0)
2037                 .cloneMemRefs(MI);
2038  MI.eraseFromParent();
2039  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2040  return true;
2041}
2042
2043bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2044 MachineFunction *MF = MI.getMF();
2045 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2046
2047 MFInfo->setInitWholeWave();
2048 return selectImpl(MI, *CoverageInfo);
2049}
2050
2051static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2052 bool &IsTexFail) {
2053 if (TexFailCtrl)
2054 IsTexFail = true;
2055
2056 TFE = TexFailCtrl & 0x1;
2057 TexFailCtrl &= ~(uint64_t)0x1;
2058 LWE = TexFailCtrl & 0x2;
2059 TexFailCtrl &= ~(uint64_t)0x2;
2060
2061 return TexFailCtrl == 0;
2062}
2063
// Select an image (MIMG) intrinsic: compute the data/address dword counts,
// pick the encoding-specific opcode, then emit the instruction with its long
// tail of modifier immediates in encoding order.
2064bool AMDGPUInstructionSelector::selectImageIntrinsic(
2065 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2066  MachineBasicBlock *MBB = MI.getParent();
2067  const DebugLoc &DL = MI.getDebugLoc();
2068  unsigned IntrOpcode = Intr->BaseOpcode;
2069
2070  // For image atomic: use no-return opcode if result is unused.
2071  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2072    Register ResultDef = MI.getOperand(0).getReg();
2073    if (MRI->use_nodbg_empty(ResultDef))
2074      IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2075  }
2076
2077  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2079
2080  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2081  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2082  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2083  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2084
  // Intrinsic arguments start right after the defs and the intrinsic ID.
2085  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2086
2087  Register VDataIn = AMDGPU::NoRegister;
2088  Register VDataOut = AMDGPU::NoRegister;
2089  LLT VDataTy;
2090  int NumVDataDwords = -1;
2091  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2092               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2093
  // Non-sampler ops are implicitly unorm; samplers carry an explicit flag.
2094  bool Unorm;
2095  if (!BaseOpcode->Sampler)
2096    Unorm = true;
2097  else
2098    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2099
2100  bool TFE;
2101  bool LWE;
2102  bool IsTexFail = false;
2103  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2104                    TFE, LWE, IsTexFail))
2105    return false;
2106
2107  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2108  const bool IsA16 = (Flags & 1) != 0;
2109  const bool IsG16 = (Flags & 2) != 0;
2110
2111  // A16 implies 16 bit gradients if subtarget doesn't support G16
2112  if (IsA16 && !STI.hasG16() && !IsG16)
2113    return false;
2114
2115  unsigned DMask = 0;
2116  unsigned DMaskLanes = 0;
2117
2118  if (BaseOpcode->Atomic) {
2119    if (!BaseOpcode->NoReturn)
2120      VDataOut = MI.getOperand(0).getReg();
2121    VDataIn = MI.getOperand(2).getReg();
2122    LLT Ty = MRI->getType(VDataIn);
2123
2124    // Be careful to allow atomic swap on 16-bit element vectors.
2125    const bool Is64Bit = BaseOpcode->AtomicX2 ?
2126      Ty.getSizeInBits() == 128 :
2127      Ty.getSizeInBits() == 64;
2128
2129    if (BaseOpcode->AtomicX2) {
2130      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2131
2132      DMask = Is64Bit ? 0xf : 0x3;
2133      NumVDataDwords = Is64Bit ? 4 : 2;
2134    } else {
2135      DMask = Is64Bit ? 0x3 : 0x1;
2136      NumVDataDwords = Is64Bit ? 2 : 1;
2137    }
2138  } else {
2139    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2140    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2141
2142    if (BaseOpcode->Store) {
2143      VDataIn = MI.getOperand(1).getReg();
2144      VDataTy = MRI->getType(VDataIn);
2145      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2146    } else if (BaseOpcode->NoReturn) {
2147      NumVDataDwords = 0;
2148    } else {
2149      VDataOut = MI.getOperand(0).getReg();
2150      VDataTy = MRI->getType(VDataOut);
2151      NumVDataDwords = DMaskLanes;
2152
      // Packed D16 halves the number of result dwords (two lanes per dword).
2153      if (IsD16 && !STI.hasUnpackedD16VMem())
2154        NumVDataDwords = (DMaskLanes + 1) / 2;
2155    }
2156  }
2157
2158  // Set G16 opcode
2159  if (Subtarget->hasG16() && IsG16) {
2160    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2162    assert(G16MappingInfo);
2163    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2164  }
2165
2166  // TODO: Check this in verifier.
2167  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2168
2169  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2170  // Keep GLC only when the atomic's result is actually used.
2171  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2173  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2175    return false;
2176
2177  int NumVAddrRegs = 0;
2178  int NumVAddrDwords = 0;
2179  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2180    // Skip the $noregs and 0s inserted during legalization.
2181    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2182    if (!AddrOp.isReg())
2183      continue; // XXX - Break?
2184
2185    Register Addr = AddrOp.getReg();
2186    if (!Addr)
2187      break;
2188
2189    ++NumVAddrRegs;
2190    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2191  }
2192
2193  // The legalizer preprocessed the intrinsic arguments. If we aren't using
2194  // NSA, these should have been packed into a single value in the first
2195  // address register
2196  const bool UseNSA =
2197      NumVAddrRegs != 1 &&
2198      (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2199                                   : NumVAddrDwords == NumVAddrRegs);
2200  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2201    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2202    return false;
2203  }
2204
  // Tex-fail returns an extra status dword.
2205  if (IsTexFail)
2206    ++NumVDataDwords;
2207
  // Resolve the generation-specific encoding, falling back through older
  // encodings on pre-GFX10 targets.
2208  int Opcode = -1;
2209  if (IsGFX12Plus) {
2210    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2211                                   NumVDataDwords, NumVAddrDwords);
2212  } else if (IsGFX11Plus) {
2213    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2214                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
2215                                          : AMDGPU::MIMGEncGfx11Default,
2216                                   NumVDataDwords, NumVAddrDwords);
2217  } else if (IsGFX10Plus) {
2218    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2219                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
2220                                          : AMDGPU::MIMGEncGfx10Default,
2221                                   NumVDataDwords, NumVAddrDwords);
2222  } else {
2223    if (Subtarget->hasGFX90AInsts()) {
2224      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2225                                     NumVDataDwords, NumVAddrDwords);
2226      if (Opcode == -1) {
2227        LLVM_DEBUG(
2228            dbgs()
2229            << "requested image instruction is not supported on this GPU\n");
2230        return false;
2231      }
2232    }
2233    if (Opcode == -1 &&
2234        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2235      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2236                                     NumVDataDwords, NumVAddrDwords);
2237    if (Opcode == -1)
2238      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2239                                     NumVDataDwords, NumVAddrDwords);
2240  }
2241  if (Opcode == -1)
2242    return false;
2243
2244  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2245    .cloneMemRefs(MI);
2246
2247  if (VDataOut) {
2248    if (BaseOpcode->AtomicX2) {
      // The hardware result is twice as wide as the value the IR expects;
      // define a temp and copy out the low half.
2249      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2250
2251      Register TmpReg = MRI->createVirtualRegister(
2252        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2253      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2254
2255      MIB.addDef(TmpReg);
2256      if (!MRI->use_empty(VDataOut)) {
2257        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2258            .addReg(TmpReg, RegState::Kill, SubReg);
2259      }
2260
2261    } else {
2262      MIB.addDef(VDataOut); // vdata output
2263    }
2264  }
2265
2266  if (VDataIn)
2267    MIB.addReg(VDataIn); // vdata input
2268
2269  for (int I = 0; I != NumVAddrRegs; ++I) {
2270    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2271    if (SrcOp.isReg()) {
2272      assert(SrcOp.getReg() != 0);
2273      MIB.addReg(SrcOp.getReg());
2274    }
2275  }
2276
2277  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2278  if (BaseOpcode->Sampler)
2279    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2280
2281  MIB.addImm(DMask); // dmask
2282
2283  if (IsGFX10Plus)
2284    MIB.addImm(DimInfo->Encoding);
2285  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2286    MIB.addImm(Unorm);
2287
2288  MIB.addImm(CPol);
2289  MIB.addImm(IsA16 &&  // a16 or r128
2290             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2291  if (IsGFX10Plus)
2292    MIB.addImm(IsA16 ? -1 : 0);
2293
2294  if (!Subtarget->hasGFX90AInsts()) {
2295    MIB.addImm(TFE); // tfe
2296  } else if (TFE) {
2297    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2298    return false;
2299  }
2300
2301  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2302    MIB.addImm(LWE); // lwe
2303  if (!IsGFX10Plus)
2304    MIB.addImm(DimInfo->DA ? -1 : 0);
2305  if (BaseOpcode->HasD16)
2306    MIB.addImm(IsD16 ? -1 : 0);
2307
2308  MI.eraseFromParent();
2309  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2310  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2311  return true;
2312}
2313
2314// We need to handle this here because tablegen doesn't support matching
2315// instructions with multiple outputs.
2316bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2317 MachineInstr &MI) const {
2318 Register Dst0 = MI.getOperand(0).getReg();
2319 Register Dst1 = MI.getOperand(1).getReg();
2320
2321 const DebugLoc &DL = MI.getDebugLoc();
2322 MachineBasicBlock *MBB = MI.getParent();
2323
2324 Register Addr = MI.getOperand(3).getReg();
2325 Register Data0 = MI.getOperand(4).getReg();
2326 Register Data1 = MI.getOperand(5).getReg();
2327 unsigned Offset = MI.getOperand(6).getImm();
2328
2329 unsigned Opc;
2330 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2331 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2332 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2333 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2334 break;
2335 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2336 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2337 break;
2338 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2339 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2340 break;
2341 }
2342
2343 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2344 .addDef(Dst1)
2345 .addUse(Addr)
2346 .addUse(Data0)
2347 .addUse(Data1)
2348 .addImm(Offset)
2349 .cloneMemRefs(MI);
2350
2351 MI.eraseFromParent();
2352 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2353 return true;
2354}
2355
// Dispatch side-effecting intrinsics to their manual selection routines;
// anything not handled here falls through to the generated matcher.
2356bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2357 MachineInstr &I) const {
2358  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2359  switch (IntrinsicID) {
2360  case Intrinsic::amdgcn_end_cf:
2361    return selectEndCfIntrinsic(I);
2362  case Intrinsic::amdgcn_ds_ordered_add:
2363  case Intrinsic::amdgcn_ds_ordered_swap:
2364    return selectDSOrderedIntrinsic(I, IntrinsicID);
2365  case Intrinsic::amdgcn_ds_gws_init:
2366  case Intrinsic::amdgcn_ds_gws_barrier:
2367  case Intrinsic::amdgcn_ds_gws_sema_v:
2368  case Intrinsic::amdgcn_ds_gws_sema_br:
2369  case Intrinsic::amdgcn_ds_gws_sema_p:
2370  case Intrinsic::amdgcn_ds_gws_sema_release_all:
2371    return selectDSGWSIntrinsic(I, IntrinsicID);
2372  case Intrinsic::amdgcn_ds_append:
2373    return selectDSAppendConsume(I, true);
2374  case Intrinsic::amdgcn_ds_consume:
2375    return selectDSAppendConsume(I, false);
2376  case Intrinsic::amdgcn_init_whole_wave:
2377    return selectInitWholeWave(I);
2378  case Intrinsic::amdgcn_raw_buffer_load_lds:
2379  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2380  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2381  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2382  case Intrinsic::amdgcn_struct_buffer_load_lds:
2383  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2384  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2385  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2386    return selectBufferLoadLds(I);
2387  // Until we can store both the address space of the global and the LDS
2388  // arguments by having two MachineMemOperands on an intrinsic, we just trust
2389  // that the argument is a global pointer (buffer pointers have been handled by
2390  // a LLVM IR-level lowering).
2391  case Intrinsic::amdgcn_load_to_lds:
2392  case Intrinsic::amdgcn_load_async_to_lds:
2393  case Intrinsic::amdgcn_global_load_lds:
2394  case Intrinsic::amdgcn_global_load_async_lds:
2395    return selectGlobalLoadLds(I);
2396  case Intrinsic::amdgcn_tensor_load_to_lds:
2397  case Intrinsic::amdgcn_tensor_store_from_lds:
2398    return selectTensorLoadStore(I, IntrinsicID);
2399  case Intrinsic::amdgcn_asyncmark:
2400  case Intrinsic::amdgcn_wait_asyncmark:
    // Subtarget gate only; selection itself is left to the generated matcher.
2401    if (!Subtarget->hasAsyncMark())
2402      return false;
2403    break;
2404  case Intrinsic::amdgcn_exp_compr:
2405    if (!STI.hasCompressedExport()) {
2406      Function &F = I.getMF()->getFunction();
2407      F.getContext().diagnose(
2408          DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2409                                    I.getDebugLoc(), DS_Error));
2410      return false;
2411    }
2412    break;
2413  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2414  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2415  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2416  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2417    return selectDSBvhStackIntrinsic(I);
2418  case Intrinsic::amdgcn_s_alloc_vgpr: {
2419    // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2420    // SCC. We then need to COPY it into the result vreg.
2421    MachineBasicBlock *MBB = I.getParent();
2422    const DebugLoc &DL = I.getDebugLoc();
2423
2424    Register ResReg = I.getOperand(0).getReg();
2425
2426    MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2427                                .add(I.getOperand(2));
2428    (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2429        .addReg(AMDGPU::SCC);
2430    I.eraseFromParent();
2431    constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2432    return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2433  }
2434  case Intrinsic::amdgcn_s_barrier_init:
2435  case Intrinsic::amdgcn_s_barrier_signal_var:
2436    return selectNamedBarrierInit(I, IntrinsicID);
2437  case Intrinsic::amdgcn_s_wakeup_barrier: {
2438    if (!STI.hasSWakeupBarrier()) {
2439      Function &F = I.getMF()->getFunction();
2440      F.getContext().diagnose(
2441          DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2442                                    I.getDebugLoc(), DS_Error));
2443      return false;
2444    }
2445    return selectNamedBarrierInst(I, IntrinsicID);
2446  }
2447  case Intrinsic::amdgcn_s_barrier_join:
2448  case Intrinsic::amdgcn_s_get_named_barrier_state:
2449    return selectNamedBarrierInst(I, IntrinsicID);
2450  case Intrinsic::amdgcn_s_get_barrier_state:
2451    return selectSGetBarrierState(I, IntrinsicID);
2452  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2453    return selectSBarrierSignalIsfirst(I, IntrinsicID);
2454  }
2455  return selectImpl(I, *CoverageInfo);
2456}
2457
// Select G_SELECT: SGPR selects become SCC-based S_CSELECT; VCC-conditioned
// 32-bit selects become V_CNDMASK_B32.
2458bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2459  if (selectImpl(I, *CoverageInfo))
2460    return true;
2461
2462  MachineBasicBlock *BB = I.getParent();
2463  const DebugLoc &DL = I.getDebugLoc();
2464
2465  Register DstReg = I.getOperand(0).getReg();
2466  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2467  assert(Size <= 32 || Size == 64);
2468  const MachineOperand &CCOp = I.getOperand(1);
2469  Register CCReg = CCOp.getReg();
  // Scalar condition: route it through SCC and use S_CSELECT.
2470  if (!isVCC(CCReg, *MRI)) {
2471    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2472                                         AMDGPU::S_CSELECT_B32;
2473    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2474            .addReg(CCReg);
2475
2476    // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2477    // bank, because it does not cover the register class that we used to represent
2478    // for it. So we need to manually set the register class here.
2479    if (!MRI->getRegClassOrNull(CCReg))
2480        MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2481    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2482            .add(I.getOperand(2))
2483            .add(I.getOperand(3));
2484
2486    constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2487    I.eraseFromParent();
2488    return true;
2489  }
2490
2491  // Wide VGPR select should have been split in RegBankSelect.
2492  if (Size > 32)
2493    return false;
2494
  // VCC condition: V_CNDMASK picks operand 2 when the lane bit is set.
2495  MachineInstr *Select =
2496      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2497              .addImm(0)
2498              .add(I.getOperand(3))
2499              .addImm(0)
2500              .add(I.getOperand(2))
2501              .add(I.getOperand(1));
2502
2504  I.eraseFromParent();
2505  return true;
2506}
2507
// Select G_TRUNC. Most truncates become a subregister COPY; the
// v2s32 -> v2s16 case needs an explicit pack sequence.
2508bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2509  Register DstReg = I.getOperand(0).getReg();
2510  Register SrcReg = I.getOperand(1).getReg();
2511  const LLT DstTy = MRI->getType(DstReg);
2512  const LLT SrcTy = MRI->getType(SrcReg);
2513  const LLT S1 = LLT::scalar(1);
2514
2515  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2516  const RegisterBank *DstRB;
2517  if (DstTy == S1) {
2518    // This is a special case. We don't treat s1 for legalization artifacts as
2519    // vcc booleans.
2520    DstRB = SrcRB;
2521  } else {
2522    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2523    if (SrcRB != DstRB)
2524      return false;
2525  }
2526
2527  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2528
2529  unsigned DstSize = DstTy.getSizeInBits();
2530  unsigned SrcSize = SrcTy.getSizeInBits();
2531
2532  const TargetRegisterClass *SrcRC =
2533      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2534  const TargetRegisterClass *DstRC =
2535      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2536  if (!SrcRC || !DstRC)
2537    return false;
2538
2539  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2540      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2541    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2542    return false;
2543  }
2544
  // True16: a 32-bit to 16-bit truncate is a copy of the lo16 subregister.
2545  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2546    assert(STI.useRealTrue16Insts());
2547    const DebugLoc &DL = I.getDebugLoc();
2548    MachineBasicBlock *MBB = I.getParent();
2549    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2550        .addReg(SrcReg, {}, AMDGPU::lo16);
2551    I.eraseFromParent();
2552    return true;
2553  }
2554
  // v2s32 -> v2s16: pack the low halves of the two source elements.
2555  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2556    MachineBasicBlock *MBB = I.getParent();
2557    const DebugLoc &DL = I.getDebugLoc();
2558
2559    Register LoReg = MRI->createVirtualRegister(DstRC);
2560    Register HiReg = MRI->createVirtualRegister(DstRC);
2561    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2562        .addReg(SrcReg, {}, AMDGPU::sub0);
2563    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2564        .addReg(SrcReg, {}, AMDGPU::sub1);
2565
2566    if (IsVALU && STI.hasSDWA()) {
2567      // Write the low 16-bits of the high element into the high 16-bits of the
2568      // low element.
2569      MachineInstr *MovSDWA =
2570        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2571        .addImm(0)                             // $src0_modifiers
2572        .addReg(HiReg)                         // $src0
2573        .addImm(0)                             // $clamp
2574        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2575        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2576        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2577        .addReg(LoReg, RegState::Implicit);
      // Tie the implicit use to the def so the preserved low half is honored.
2578      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2579    } else {
      // No SDWA: (Hi << 16) | (Lo & 0xffff) via shift/and/or.
2580      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2581      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2582      Register ImmReg = MRI->createVirtualRegister(DstRC);
2583      if (IsVALU) {
2584        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2585          .addImm(16)
2586          .addReg(HiReg);
2587      } else {
2588        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2589          .addReg(HiReg)
2590          .addImm(16)
2591          .setOperandDead(3); // Dead scc
2592      }
2593
2594      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2595      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2596      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2597
2598      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2599        .addImm(0xffff);
2600      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2601        .addReg(LoReg)
2602        .addReg(ImmReg);
2603      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2604        .addReg(TmpReg0)
2605        .addReg(TmpReg1);
2606
2607      if (!IsVALU) {
2608        And.setOperandDead(3); // Dead scc
2609        Or.setOperandDead(3); // Dead scc
2610      }
2611    }
2612
2613    I.eraseFromParent();
2614    return true;
2615  }
2616
2617  if (!DstTy.isScalar())
2618    return false;
2619
  // Wide source: turn the truncate into a subregister copy of the low part.
2620  if (SrcSize > 32) {
2621    unsigned SubRegIdx = DstSize < 32
2622                             ? static_cast<unsigned>(AMDGPU::sub0)
2623                             : TRI.getSubRegFromChannel(0, DstSize / 32);
2624    if (SubRegIdx == AMDGPU::NoSubRegister)
2625      return false;
2626
2627    // Deal with weird cases where the class only partially supports the subreg
2628    // index.
2629    const TargetRegisterClass *SrcWithSubRC
2630      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2631    if (!SrcWithSubRC)
2632      return false;
2633
2634    if (SrcWithSubRC != SrcRC) {
2635      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2636        return false;
2637    }
2638
2639    I.getOperand(1).setSubReg(SubRegIdx);
2640  }
2641
2642  I.setDesc(TII.get(TargetOpcode::COPY));
2643  return true;
2644}
2645
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
/// \p Mask is set to the mask of \p Size trailing ones as a side effect.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  // Compute the mask of Size trailing ones (Size >= 32 gives all ones).
  // Without this assignment Mask would be read uninitialized below.
  Mask = Size < 32 ? ((1u << Size) - 1) : ~0u;
  // VALU inline immediates cover the signed range [-16, 64].
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}
2652
2653// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2654const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2655 Register Reg, const MachineRegisterInfo &MRI,
2656 const TargetRegisterInfo &TRI) const {
2657 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2658 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2659 return RB;
2660
2661 // Ignore the type, since we don't use vcc in artifacts.
2662 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2663 return &RBI.getRegBankFromRegClass(*RC, LLT());
2664 return nullptr;
2665}
2666
// Manually select the scalar/vector extension artifacts G_SEXT, G_ZEXT,
// G_ANYEXT and G_SEXT_INREG. For G_SEXT_INREG the effective source width is
// the immediate operand rather than the source register's type.
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  // Artifact casts should never use vcc.
  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);

  // FIXME: This should probably be illegal and split earlier.
  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
    if (DstSize <= 32)
      return selectCOPY(I);

    // Wide anyext: put the source in the low half and leave the high half
    // undefined.
    const TargetRegisterClass *SrcRC =
        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
    const TargetRegisterClass *DstRC =
        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);

    Register UndefReg = MRI->createVirtualRegister(SrcRC);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(SrcReg)
      .addImm(AMDGPU::sub0)
      .addReg(UndefReg)
      .addImm(AMDGPU::sub1);
    I.eraseFromParent();

    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
  }

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
        .addImm(Mask)
        .addReg(SrcReg);
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
      return true;
    }

    // Otherwise use a VALU bitfield extract starting at bit 0.
    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
      .addReg(SrcReg)
      .addImm(0) // Offset
      .addImm(SrcSize); // Width
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    return true;
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    // For sext_inreg of a 64-bit value, the input itself is 64-bit; in every
    // other scalar case the constrained source is 32-bit.
    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
      return false;

    // i8/i16 -> i32 sign extension has dedicated scalar instructions.
    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
    }

    // Using a single 32-bit SALU to calculate the high half is smaller than
    // S_BFE with a literal constant operand.
    if (DstSize > 32 && SrcSize == 32) {
      Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
      if (Signed) {
        // High half replicates the sign bit: ashr by 31.
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
          .addReg(SrcReg, {}, SubReg)
          .addImm(31)
          .setOperandDead(3); // Dead scc
      } else {
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
          .addImm(0);
      }
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(SrcReg, {}, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(HiReg)
        .addImm(AMDGPU::sub1);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
                                          *MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
      // We need a 64-bit register source, but the high bits don't matter.
      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;

      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg, {}, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      // Zero-extend via AND when the mask is cheap to encode.
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask)
        .setOperandDead(3); // Dead scc
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
  }

  return false;
}
2818
2822
2824 Register BitcastSrc;
2825 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2826 Reg = BitcastSrc;
2827 return Reg;
2828}
2829
2831 Register &Out) {
2832 Register Trunc;
2833 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2834 return false;
2835
2836 Register LShlSrc;
2837 Register Cst;
2838 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2839 Cst = stripCopy(Cst, MRI);
2840 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2841 Out = stripBitCast(LShlSrc, MRI);
2842 return true;
2843 }
2844 }
2845
2846 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2847 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2848 return false;
2849
2850 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2851 LLT::fixed_vector(2, 16));
2852
2853 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2854 assert(Mask.size() == 2);
2855
2856 if (Mask[0] == 1 && Mask[1] <= 1) {
2857 Out = Shuffle->getOperand(0).getReg();
2858 return true;
2859 }
2860
2861 return false;
2862}
2863
2864bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2865 if (!Subtarget->hasSALUFloatInsts())
2866 return false;
2867
2868 Register Dst = I.getOperand(0).getReg();
2869 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2870 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2871 return false;
2872
2873 Register Src = I.getOperand(1).getReg();
2874
2875 if (MRI->getType(Dst) == LLT::scalar(32) &&
2876 MRI->getType(Src) == LLT::scalar(16)) {
2877 if (isExtractHiElt(*MRI, Src, Src)) {
2878 MachineBasicBlock *BB = I.getParent();
2879 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2880 .addUse(Src);
2881 I.eraseFromParent();
2882 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2883 }
2884 }
2885
2886 return false;
2887}
2888
bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
  // Only manually handle the f64 SGPR case.
  //
  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
  // the bit ops theoretically have a second result due to the implicit def of
  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
  // that is easy by disabling the check. The result works, but uses a
  // nonsensical sreg32orlds_and_sreg_1 regclass.
  //
  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
  // the variadic REG_SEQUENCE operands.

  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  // Fold fneg(fabs(x)) by taking the fabs source and forcing the sign bit
  // instead of toggling it.
  Register Src = MI.getOperand(1).getReg();
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  // Split the 64-bit value, operate on the sign bit of the high half, then
  // reassemble with the untouched low half.
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, {}, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, {}, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg)
    .setOperandDead(3); // Dead scc
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}
2944
2945// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2946bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2947 Register Dst = MI.getOperand(0).getReg();
2948 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2949 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2950 MRI->getType(Dst) != LLT::scalar(64))
2951 return false;
2952
2953 Register Src = MI.getOperand(1).getReg();
2954 MachineBasicBlock *BB = MI.getParent();
2955 const DebugLoc &DL = MI.getDebugLoc();
2956 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2957 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2958 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2959 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2960
2961 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2962 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2963 return false;
2964
2965 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2966 .addReg(Src, {}, AMDGPU::sub0);
2967 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2968 .addReg(Src, {}, AMDGPU::sub1);
2969 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2970 .addImm(0x7fffffff);
2971
2972 // Clear sign bit.
2973 // TODO: Should this used S_BITSET0_*?
2974 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2975 .addReg(HiReg)
2976 .addReg(ConstReg)
2977 .setOperandDead(3); // Dead scc
2978 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2979 .addReg(LoReg)
2980 .addImm(AMDGPU::sub0)
2981 .addReg(OpReg)
2982 .addImm(AMDGPU::sub1);
2983
2984 MI.eraseFromParent();
2985 return true;
2986}
2987
2988static bool isConstant(const MachineInstr &MI) {
2989 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2990}
2991
// Walk the chain of G_PTR_ADDs feeding \p Load's pointer operand, recording
// for each one its constant immediate offset and which register addends are
// SGPR vs. VGPR. Results accumulate into \p AddrInfo, innermost GEP first.
void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  // G_PREFETCH carries the pointer in operand 0; loads carry it in operand 1.
  unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
  const MachineInstr *PtrMI =
      MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo;

  // Operand 1 is the base, operand 2 the offset.
  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    // Classify the register addend by its bank.
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  // The base may itself be a G_PTR_ADD; recurse to record the whole chain.
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
3027
3028bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3029 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3030}
3031
3032bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3033 if (!MI.hasOneMemOperand())
3034 return false;
3035
3036 const MachineMemOperand *MMO = *MI.memoperands_begin();
3037 const Value *Ptr = MMO->getValue();
3038
3039 // UndefValue means this is a load of a kernel input. These are uniform.
3040 // Sometimes LDS instructions have constant pointers.
3041 // If Ptr is null, then that means this mem operand contains a
3042 // PseudoSourceValue like GOT.
3044 return true;
3045
3047 return true;
3048
3049 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3050 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3051 AMDGPU::SGPRRegBankID;
3052
3053 const Instruction *I = dyn_cast<Instruction>(Ptr);
3054 return I && I->getMetadata("amdgpu.uniform");
3055}
3056
3057bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3058 for (const GEPInfo &GEPInfo : AddrInfo) {
3059 if (!GEPInfo.VgprParts.empty())
3060 return true;
3061 }
3062 return false;
3063}
3064
3065void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3066 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3067 unsigned AS = PtrTy.getAddressSpace();
3069 STI.ldsRequiresM0Init()) {
3070 MachineBasicBlock *BB = I.getParent();
3071
3072 // If DS instructions require M0 initialization, insert it before selecting.
3073 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3074 .addImm(-1);
3075 }
3076}
3077
// Select generic loads/stores/atomics: set up M0 first (needed by DS
// instructions on some subtargets), then defer to the TableGen'erated
// patterns.
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
  MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}
3083
3085 if (Reg.isPhysical())
3086 return false;
3087
3089 const unsigned Opcode = MI.getOpcode();
3090
3091 if (Opcode == AMDGPU::COPY)
3092 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3093
3094 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3095 Opcode == AMDGPU::G_XOR)
3096 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3097 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3098
3099 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3100 return GI->is(Intrinsic::amdgcn_class);
3101
3102 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3103}
3104
// Select G_BRCOND: lower to either an SCC branch (uniform condition) or a
// VCC branch (divergent condition), copying the condition into the
// corresponding physical register first.
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    // Uniform path: branch on SCC.
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32RegClass;
  } else {
    // FIXME: Should scc->vcc copies and with exec?

    // Unless the value of CondReg is a result of a V_CMP* instruction then we
    // need to insert an and with exec.
    if (!isVCmpResult(CondReg, *MRI)) {
      const bool Is64 = STI.isWave64();
      const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;

      Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
      BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
        .addReg(CondReg)
        .addReg(Exec)
        .setOperandDead(3); // Dead scc
      CondReg = TmpReg;
    }

    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  // Copy the condition into SCC/VCC and emit the conditional branch.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}
3161
3162bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3163 MachineInstr &I) const {
3164 Register DstReg = I.getOperand(0).getReg();
3165 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3166 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3167 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3168 if (IsVGPR)
3169 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3170
3171 return RBI.constrainGenericRegister(
3172 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3173}
3174
// Select G_PTRMASK (pointer & mask). Uses known-bits of the mask to avoid
// emitting AND operations on halves of a 64-bit pointer that the mask leaves
// fully intact.
bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  Register MaskReg = I.getOperand(2).getReg();
  LLT Ty = MRI->getType(DstReg);
  LLT MaskTy = MRI->getType(MaskReg);
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand written MIR.
    return false;

  // Try to avoid emitting a bit operation when we only need to touch half of
  // the 64-bit pointer.
  APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);

  // A half whose mask bits are all known-one can be copied unchanged.
  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;

  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
      !CanCopyLow32 && !CanCopyHi32) {
    // Scalar 64-bit case where both halves need masking: a single S_AND_B64.
    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
      .addReg(SrcReg)
      .addReg(MaskReg)
      .setOperandDead(3); // Dead scc
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    return true;
  }

  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
  const TargetRegisterClass *MaskRC =
      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
    return false;

  if (Ty.getSizeInBits() == 32) {
    assert(MaskTy.getSizeInBits() == 32 &&
           "ptrmask should have been narrowed during legalize");

    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
      .addReg(SrcReg)
      .addReg(MaskReg);

    if (!IsVGPR)
      NewOp.setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return true;
  }

  // 64-bit case: process the pointer one 32-bit half at a time.
  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);

  // Extract the subregisters from the source pointer.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, {}, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, {}, AMDGPU::sub1);

  Register MaskedLo, MaskedHi;

  if (CanCopyLow32) {
    // If all the bits in the low half are 1, we only need a copy for it.
    MaskedLo = LoReg;
  } else {
    // Extract the mask subregister and apply the and.
    Register MaskLo = MRI->createVirtualRegister(&RegRC);
    MaskedLo = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
      .addReg(MaskReg, {}, AMDGPU::sub0);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
      .addReg(LoReg)
      .addReg(MaskLo);
  }

  if (CanCopyHi32) {
    // If all the bits in the high half are 1, we only need a copy for it.
    MaskedHi = HiReg;
  } else {
    Register MaskHi = MRI->createVirtualRegister(&RegRC);
    MaskedHi = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
      .addReg(MaskReg, {}, AMDGPU::sub1);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
      .addReg(HiReg)
      .addReg(MaskHi);
  }

  // Recombine the two (possibly masked) halves into the 64-bit result.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskedLo)
    .addImm(AMDGPU::sub0)
    .addReg(MaskedHi)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}
3287
3288/// Return the register to use for the index value, and the subregister to use
3289/// for the indirectly accessed register.
3290static std::pair<Register, unsigned>
3292 const TargetRegisterClass *SuperRC, Register IdxReg,
3293 unsigned EltSize, GISelValueTracking &ValueTracking) {
3294 Register IdxBaseReg;
3295 int Offset;
3296
3297 std::tie(IdxBaseReg, Offset) =
3298 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3299 if (IdxBaseReg == AMDGPU::NoRegister) {
3300 // This will happen if the index is a known constant. This should ordinarily
3301 // be legalized out, but handle it as a register just in case.
3302 assert(Offset == 0);
3303 IdxBaseReg = IdxReg;
3304 }
3305
3306 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3307
3308 // Skip out of bounds offsets, or else we would end up using an undefined
3309 // register.
3310 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3311 return std::pair(IdxReg, SubRegs[0]);
3312 return std::pair(IdxBaseReg, SubRegs[Offset]);
3313}
3314
// Select G_EXTRACT_VECTOR_ELT with a (scalar) variable index, using
// M0-relative moves or VGPR index mode depending on the banks and subtarget.
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
  if (!SrcRC || !DstRC)
    return false;
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  // Fold any constant part of the index into the subregister selection.
  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
      *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    // Scalar source: use M0-indexed S_MOVRELS.
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, {}, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    // Vector source without index mode: M0-indexed V_MOVRELS.
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, {}, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Vector source with VGPR index mode: use the GPR-indexed pseudo.
  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(SrcReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
3390
3391// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3392bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3393 MachineInstr &MI) const {
3394 Register DstReg = MI.getOperand(0).getReg();
3395 Register VecReg = MI.getOperand(1).getReg();
3396 Register ValReg = MI.getOperand(2).getReg();
3397 Register IdxReg = MI.getOperand(3).getReg();
3398
3399 LLT VecTy = MRI->getType(DstReg);
3400 LLT ValTy = MRI->getType(ValReg);
3401 unsigned VecSize = VecTy.getSizeInBits();
3402 unsigned ValSize = ValTy.getSizeInBits();
3403
3404 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3405 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3406 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3407
3408 assert(VecTy.getElementType() == ValTy);
3409
3410 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3411 // into a waterfall loop.
3412 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3413 return false;
3414
3415 const TargetRegisterClass *VecRC =
3416 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3417 const TargetRegisterClass *ValRC =
3418 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3419
3420 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3421 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3422 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3423 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3424 return false;
3425
3426 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3427 return false;
3428
3429 unsigned SubReg;
3430 std::tie(IdxReg, SubReg) =
3431 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3432
3433 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3434 STI.useVGPRIndexMode();
3435
3436 MachineBasicBlock *BB = MI.getParent();
3437 const DebugLoc &DL = MI.getDebugLoc();
3438
3439 if (!IndexMode) {
3440 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3441 .addReg(IdxReg);
3442
3443 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3444 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3445 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3446 .addReg(VecReg)
3447 .addReg(ValReg)
3448 .addImm(SubReg);
3449 MI.eraseFromParent();
3450 return true;
3451 }
3452
3453 const MCInstrDesc &GPRIDXDesc =
3454 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3455 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3456 .addReg(VecReg)
3457 .addReg(ValReg)
3458 .addReg(IdxReg)
3459 .addImm(SubReg);
3460
3461 MI.eraseFromParent();
3462 return true;
3463}
3464
3465static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3466 switch (Intr) {
3467 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3468 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3469 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3470 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3471 case Intrinsic::amdgcn_load_async_to_lds:
3472 case Intrinsic::amdgcn_global_load_async_lds:
3473 return true;
3474 }
3475 return false;
3476}
3477
3478bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3479 if (!Subtarget->hasVMemToLDSLoad())
3480 return false;
3481 unsigned Opc;
3482 unsigned Size = MI.getOperand(3).getImm();
3483 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3484
3485 // The struct intrinsic variants add one additional operand over raw.
3486 const bool HasVIndex = MI.getNumOperands() == 9;
3487 Register VIndex;
3488 int OpOffset = 0;
3489 if (HasVIndex) {
3490 VIndex = MI.getOperand(4).getReg();
3491 OpOffset = 1;
3492 }
3493
3494 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3495 std::optional<ValueAndVReg> MaybeVOffset =
3497 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3498
3499 switch (Size) {
3500 default:
3501 return false;
3502 case 1:
3503 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3504 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3505 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3506 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3507 break;
3508 case 2:
3509 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3510 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3511 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3512 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3513 break;
3514 case 4:
3515 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3516 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3517 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3518 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3519 break;
3520 case 12:
3521 if (!Subtarget->hasLDSLoadB96_B128())
3522 return false;
3523
3524 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3525 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3526 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3527 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3528 break;
3529 case 16:
3530 if (!Subtarget->hasLDSLoadB96_B128())
3531 return false;
3532
3533 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3534 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3535 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3536 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3537 break;
3538 }
3539
3540 MachineBasicBlock *MBB = MI.getParent();
3541 const DebugLoc &DL = MI.getDebugLoc();
3542 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3543 .add(MI.getOperand(2));
3544
3545 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3546
3547 if (HasVIndex && HasVOffset) {
3548 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3549 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3550 .addReg(VIndex)
3551 .addImm(AMDGPU::sub0)
3552 .addReg(VOffset)
3553 .addImm(AMDGPU::sub1);
3554
3555 MIB.addReg(IdxReg);
3556 } else if (HasVIndex) {
3557 MIB.addReg(VIndex);
3558 } else if (HasVOffset) {
3559 MIB.addReg(VOffset);
3560 }
3561
3562 MIB.add(MI.getOperand(1)); // rsrc
3563 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3564 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3565 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3566 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3567 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3568 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3569 MIB.addImm(
3570 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3571 ? 1
3572 : 0); // swz
3573 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3574
3575 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3576 // Don't set the offset value here because the pointer points to the base of
3577 // the buffer.
3578 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3579
3580 MachinePointerInfo StorePtrI = LoadPtrI;
3581 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3585
3586 auto F = LoadMMO->getFlags() &
3588 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3589 Size, LoadMMO->getBaseAlign());
3590
3591 MachineMemOperand *StoreMMO =
3592 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3593 sizeof(int32_t), LoadMMO->getBaseAlign());
3594
3595 MIB.setMemRefs({LoadMMO, StoreMMO});
3596
3597 MI.eraseFromParent();
3598 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3599 return true;
3600}
3601
3602/// Match a zero extend from a 32-bit value to 64-bits.
3603Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3604 Register ZExtSrc;
3605 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3606 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3607
3608 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3609 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3610 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3611 return Register();
3612
3613 assert(Def->getNumOperands() == 3 &&
3614 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3615 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3616 return Def->getOperand(1).getReg();
3617 }
3618
3619 return Register();
3620}
3621
3622/// Match a sign extend from a 32-bit value to 64-bits.
3623Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3624 Register SExtSrc;
3625 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3626 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3627
3628 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3629 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3630 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3631 return Register();
3632
3633 assert(Def->getNumOperands() == 3 &&
3634 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3635 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3636 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3637 m_SpecificICst(31))))
3638 return Def->getOperand(1).getReg();
3639
3640 if (VT->signBitIsZero(Reg))
3641 return matchZeroExtendFromS32(Reg);
3642
3643 return Register();
3644}
3645
3646/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3647/// is 32-bit.
3649AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3650 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3651 : matchZeroExtendFromS32(Reg);
3652}
3653
3654/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3655/// is 32-bit.
3657AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3658 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3659 : matchSignExtendFromS32(Reg);
3660}
3661
3663AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3664 bool IsSigned) const {
3665 if (IsSigned)
3666 return matchSignExtendFromS32OrS32(Reg);
3667
3668 return matchZeroExtendFromS32OrS32(Reg);
3669}
3670
3671Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3672 Register AnyExtSrc;
3673 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3674 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3675
3676 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3677 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3678 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3679 return Register();
3680
3681 assert(Def->getNumOperands() == 3 &&
3682 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3683
3684 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3685 return Def->getOperand(1).getReg();
3686
3687 return Register();
3688}
3689
3690bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3691 if (!Subtarget->hasVMemToLDSLoad())
3692 return false;
3693
3694 unsigned Opc;
3695 unsigned Size = MI.getOperand(3).getImm();
3696 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3697
3698 switch (Size) {
3699 default:
3700 return false;
3701 case 1:
3702 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3703 break;
3704 case 2:
3705 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3706 break;
3707 case 4:
3708 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3709 break;
3710 case 12:
3711 if (!Subtarget->hasLDSLoadB96_B128())
3712 return false;
3713 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3714 break;
3715 case 16:
3716 if (!Subtarget->hasLDSLoadB96_B128())
3717 return false;
3718 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3719 break;
3720 }
3721
3722 MachineBasicBlock *MBB = MI.getParent();
3723 const DebugLoc &DL = MI.getDebugLoc();
3724 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3725 .add(MI.getOperand(2));
3726
3727 Register Addr = MI.getOperand(1).getReg();
3728 Register VOffset;
3729 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3730 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3731 if (!isSGPR(Addr)) {
3732 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3733 if (isSGPR(AddrDef->Reg)) {
3734 Addr = AddrDef->Reg;
3735 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3736 Register SAddr =
3737 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3738 if (isSGPR(SAddr)) {
3739 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3740 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3741 Addr = SAddr;
3742 VOffset = Off;
3743 }
3744 }
3745 }
3746 }
3747
3748 if (isSGPR(Addr)) {
3750 if (!VOffset) {
3751 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3752 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3753 .addImm(0);
3754 }
3755 }
3756
3757 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3758 .addReg(Addr);
3759
3760 if (isSGPR(Addr))
3761 MIB.addReg(VOffset);
3762
3763 MIB.add(MI.getOperand(4)); // offset
3764
3765 unsigned Aux = MI.getOperand(5).getImm();
3766 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3767 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3768
3769 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3771 LoadPtrI.Offset = MI.getOperand(4).getImm();
3772 MachinePointerInfo StorePtrI = LoadPtrI;
3773 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3777 auto F = LoadMMO->getFlags() &
3779 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3780 Size, LoadMMO->getBaseAlign());
3781 MachineMemOperand *StoreMMO =
3782 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3783 sizeof(int32_t), Align(4));
3784
3785 MIB.setMemRefs({LoadMMO, StoreMMO});
3786
3787 MI.eraseFromParent();
3788 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3789 return true;
3790}
3791
3792bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3793 Intrinsic::ID IID) const {
3794 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3795 unsigned Opc =
3796 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3797 int NumGroups = 4;
3798
3799 // A lamda function to check whether an operand is a vector of all 0s.
3800 const auto isAllZeros = [&](MachineOperand &Opnd) {
3801 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3802 if (!DefMI)
3803 return false;
3804 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3805 };
3806
3807 // Use _D2 version if both group 2 and 3 are zero-initialized.
3808 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3809 NumGroups = 2;
3810 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3811 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3812 }
3813
3814 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3815 // for now because all existing targets only support up to 4 groups.
3816 MachineBasicBlock *MBB = MI.getParent();
3817 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3818 .add(MI.getOperand(1)) // D# group 0
3819 .add(MI.getOperand(2)); // D# group 1
3820
3821 if (NumGroups >= 4) { // Has at least 4 groups
3822 MIB.add(MI.getOperand(3)) // D# group 2
3823 .add(MI.getOperand(4)); // D# group 3
3824 }
3825
3826 MIB.addImm(0) // r128
3827 .add(MI.getOperand(6)); // cpol
3828
3829 MI.eraseFromParent();
3830 return true;
3831}
3832
3833bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3834 MachineInstr &MI) const {
3835 unsigned OpcodeOpIdx =
3836 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3837 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3838 MI.removeOperand(OpcodeOpIdx);
3839 MI.addImplicitDefUseOperands(*MI.getMF());
3840 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3841 return true;
3842}
3843
// FIXME: This should be removed and let the patterns select. We just need the
// AGPR/VGPR combination versions.
//
// Select an @llvm.amdgcn.smfmac.* intrinsic to its V_SMFMAC_*_e64 pseudo by
// rewriting the generic instruction in place: the intrinsic ID operand is
// dropped and the accumulator input (VDst_In) is moved to the end of the
// operand list.
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  unsigned Opc;
  // Map each sparse-matrix FMA intrinsic to its machine opcode.
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
    break;
  default:
    llvm_unreachable("unhandled smfmac intrinsic");
  }

  // Save the accumulator operand (by value) before mutating the instruction.
  auto VDst_In = MI.getOperand(4);

  MI.setDesc(TII.get(Opc));
  MI.removeOperand(4); // VDst_In
  MI.removeOperand(1); // Intrinsic ID
  MI.addOperand(VDst_In); // Readd VDst_In to the end
  MI.addImplicitDefUseOperands(*MI.getMF());
  // Honor an early-clobber constraint on the destination when the opcode
  // declares one.
  const MCInstrDesc &MCID = MI.getDesc();
  if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
    MI.getOperand(0).setIsEarlyClobber(true);
  }
  return true;
}
3950
3951bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3952 MachineInstr &MI, Intrinsic::ID IntrID) const {
3953 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3954 !Subtarget->hasPermlane16Swap())
3955 return false;
3956 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3957 !Subtarget->hasPermlane32Swap())
3958 return false;
3959
3960 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3961 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3962 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3963
3964 MI.removeOperand(2);
3965 MI.setDesc(TII.get(Opcode));
3966 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3967
3968 MachineOperand &FI = MI.getOperand(4);
3970
3971 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3972 return true;
3973}
3974
3975bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3976 Register DstReg = MI.getOperand(0).getReg();
3977 Register SrcReg = MI.getOperand(1).getReg();
3978 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3979 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3980 MachineBasicBlock *MBB = MI.getParent();
3981 const DebugLoc &DL = MI.getDebugLoc();
3982
3983 if (IsVALU) {
3984 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3985 .addImm(Subtarget->getWavefrontSizeLog2())
3986 .addReg(SrcReg);
3987 } else {
3988 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3989 .addReg(SrcReg)
3990 .addImm(Subtarget->getWavefrontSizeLog2())
3991 .setOperandDead(3); // Dead scc
3992 }
3993
3994 const TargetRegisterClass &RC =
3995 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3996 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3997 return false;
3998
3999 MI.eraseFromParent();
4000 return true;
4001}
4002
// Select a wave-shuffle intrinsic to ds_bpermute. Only s32 results are
// handled, and only on subtargets that support bpermute. When the hardware
// cannot bpermute across the whole wave (wave64 without wave-wide bpermute),
// a whole-wave-mode sequence combines a same-half permute with an
// opposite-half permute (via v_permlane64) selected per lane.
bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
    MachineInstr &MI) const {
  assert(MI.getNumOperands() == 4);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  unsigned DstSize = DstTy.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);

  // Only 32-bit scalar shuffles are supported here.
  if (DstTy != LLT::scalar(32))
    return false;

  if (!Subtarget->supportsBPermute())
    return false;

  // If we can bpermute across the whole wave, then just do that
  if (Subtarget->supportsWaveWideBPermute()) {
    // ds_bpermute indexes by byte, so scale the lane index by 4.
    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
        .addImm(2)
        .addReg(IdxReg);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
        .addReg(ShiftIdxReg)
        .addReg(ValReg)
        .addImm(0);
  } else {
    // Otherwise, we need to make use of whole wave mode
    assert(Subtarget->isWave64());

    // Set inactive lanes to poison
    Register UndefValReg =
        MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);

    Register UndefExecReg = MRI->createVirtualRegister(
        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);

    Register PoisonValReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
        .addImm(0)
        .addReg(ValReg)
        .addImm(0)
        .addReg(UndefValReg)
        .addReg(UndefExecReg);

    // ds_bpermute requires index to be multiplied by 4
    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
        .addImm(2)
        .addReg(IdxReg);

    Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
        .addImm(0)
        .addReg(ShiftIdxReg)
        .addImm(0)
        .addReg(UndefValReg)
        .addReg(UndefExecReg);

    // Get permutation of each half, then we'll select which one to use
    Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
        .addReg(PoisonIdxReg)
        .addReg(PoisonValReg)
        .addImm(0);

    // Swap the two 32-lane halves so each half can read the other's values.
    Register SwappedValReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
        .addReg(PoisonValReg);

    Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
        .addReg(PoisonIdxReg)
        .addReg(SwappedValReg)
        .addImm(0);

    // The opposite-half permute must run in whole wave mode so inactive
    // lanes contribute their values.
    Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
        .addReg(OppSidePermReg);

    // Select which side to take the permute from
    // We can get away with only using mbcnt_lo here since we're only
    // trying to detect which side of 32 each lane is on, and mbcnt_lo
    // returns 32 for lanes 32-63.
    Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
        .addImm(-1)
        .addImm(0);

    // Bit 5 of (lane-id XOR index) tells whether the source lane lives in
    // the other 32-lane half.
    Register XORReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
        .addReg(ThreadIDReg)
        .addReg(PoisonIdxReg);

    Register ANDReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
        .addReg(XORReg)
        .addImm(32);

    Register CompareReg = MRI->createVirtualRegister(
        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
        .addReg(ANDReg)
        .addImm(0);

    // Finally do the selection
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(WWMSwapPermReg)
        .addImm(0)
        .addReg(SameSidePermReg)
        .addReg(CompareReg);
  }

  MI.eraseFromParent();
  return true;
}
4129
4130// Match BITOP3 operation and return a number of matched instructions plus
4131// truth table.
4132static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4134 const MachineRegisterInfo &MRI) {
4135 unsigned NumOpcodes = 0;
4136 uint8_t LHSBits, RHSBits;
4137
4138 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4139 // Define truth table given Src0, Src1, Src2 bits permutations:
4140 // 0 0 0
4141 // 0 0 1
4142 // 0 1 0
4143 // 0 1 1
4144 // 1 0 0
4145 // 1 0 1
4146 // 1 1 0
4147 // 1 1 1
4148 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4149
4150 if (mi_match(Op, MRI, m_AllOnesInt())) {
4151 Bits = 0xff;
4152 return true;
4153 }
4154 if (mi_match(Op, MRI, m_ZeroInt())) {
4155 Bits = 0;
4156 return true;
4157 }
4158
4159 for (unsigned I = 0; I < Src.size(); ++I) {
4160 // Try to find existing reused operand
4161 if (Src[I] == Op) {
4162 Bits = SrcBits[I];
4163 return true;
4164 }
4165 // Try to replace parent operator
4166 if (Src[I] == R) {
4167 Bits = SrcBits[I];
4168 Src[I] = Op;
4169 return true;
4170 }
4171 }
4172
4173 if (Src.size() == 3) {
4174 // No room left for operands. Try one last time, there can be a 'not' of
4175 // one of our source operands. In this case we can compute the bits
4176 // without growing Src vector.
4177 Register LHS;
4178 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4180 for (unsigned I = 0; I < Src.size(); ++I) {
4181 if (Src[I] == LHS) {
4182 Bits = ~SrcBits[I];
4183 return true;
4184 }
4185 }
4186 }
4187
4188 return false;
4189 }
4190
4191 Bits = SrcBits[Src.size()];
4192 Src.push_back(Op);
4193 return true;
4194 };
4195
4196 MachineInstr *MI = MRI.getVRegDef(R);
4197 switch (MI->getOpcode()) {
4198 case TargetOpcode::G_AND:
4199 case TargetOpcode::G_OR:
4200 case TargetOpcode::G_XOR: {
4201 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4202 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4203
4204 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4205 if (!getOperandBits(LHS, LHSBits) ||
4206 !getOperandBits(RHS, RHSBits)) {
4207 Src = std::move(Backup);
4208 return std::make_pair(0, 0);
4209 }
4210
4211 // Recursion is naturally limited by the size of the operand vector.
4212 auto Op = BitOp3_Op(LHS, Src, MRI);
4213 if (Op.first) {
4214 NumOpcodes += Op.first;
4215 LHSBits = Op.second;
4216 }
4217
4218 Op = BitOp3_Op(RHS, Src, MRI);
4219 if (Op.first) {
4220 NumOpcodes += Op.first;
4221 RHSBits = Op.second;
4222 }
4223 break;
4224 }
4225 default:
4226 return std::make_pair(0, 0);
4227 }
4228
4229 uint8_t TTbl;
4230 switch (MI->getOpcode()) {
4231 case TargetOpcode::G_AND:
4232 TTbl = LHSBits & RHSBits;
4233 break;
4234 case TargetOpcode::G_OR:
4235 TTbl = LHSBits | RHSBits;
4236 break;
4237 case TargetOpcode::G_XOR:
4238 TTbl = LHSBits ^ RHSBits;
4239 break;
4240 default:
4241 break;
4242 }
4243
4244 return std::make_pair(NumOpcodes + 1, TTbl);
4245}
4246
4247bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4248 if (!Subtarget->hasBitOp3Insts())
4249 return false;
4250
4251 Register DstReg = MI.getOperand(0).getReg();
4252 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4253 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4254 if (!IsVALU)
4255 return false;
4256
4258 uint8_t TTbl;
4259 unsigned NumOpcodes;
4260
4261 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4262
4263 // Src.empty() case can happen if all operands are all zero or all ones.
4264 // Normally it shall be optimized out before reaching this.
4265 if (NumOpcodes < 2 || Src.empty())
4266 return false;
4267
4268 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4269 if (NumOpcodes == 2 && IsB32) {
4270 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4271 // asm more readable. This cannot be modeled with AddedComplexity because
4272 // selector does not know how many operations did we match.
4273 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4274 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4275 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4276 return false;
4277 } else if (NumOpcodes < 4) {
4278 // For a uniform case threshold should be higher to account for moves
4279 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4280 // in SGPRs and a readtfirstlane after.
4281 return false;
4282 }
4283
4284 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4285 if (!IsB32 && STI.hasTrue16BitInsts())
4286 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4287 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4288 unsigned CBL = STI.getConstantBusLimit(Opc);
4289 MachineBasicBlock *MBB = MI.getParent();
4290 const DebugLoc &DL = MI.getDebugLoc();
4291
4292 for (unsigned I = 0; I < Src.size(); ++I) {
4293 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4294 if (RB->getID() != AMDGPU::SGPRRegBankID)
4295 continue;
4296 if (CBL > 0) {
4297 --CBL;
4298 continue;
4299 }
4300 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4301 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4302 .addReg(Src[I]);
4303 Src[I] = NewReg;
4304 }
4305
4306 // Last operand can be ignored, turning a ternary operation into a binary.
4307 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4308 // 'c' with 'a' here without changing the answer. In some pathological
4309 // cases it should be possible to get an operation with a single operand
4310 // too if optimizer would not catch it.
4311 while (Src.size() < 3)
4312 Src.push_back(Src[0]);
4313
4314 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4315 if (!IsB32)
4316 MIB.addImm(0); // src_mod0
4317 MIB.addReg(Src[0]);
4318 if (!IsB32)
4319 MIB.addImm(0); // src_mod1
4320 MIB.addReg(Src[1]);
4321 if (!IsB32)
4322 MIB.addImm(0); // src_mod2
4323 MIB.addReg(Src[2])
4324 .addImm(TTbl);
4325 if (!IsB32)
4326 MIB.addImm(0); // op_sel
4327
4328 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4329 MI.eraseFromParent();
4330
4331 return true;
4332}
4333
4334bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4335 Register SrcReg = MI.getOperand(0).getReg();
4336 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4337 return false;
4338
4339 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4340 Register SP =
4341 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4342 Register WaveAddr = getWaveAddress(DefMI);
4343 MachineBasicBlock *MBB = MI.getParent();
4344 const DebugLoc &DL = MI.getDebugLoc();
4345
4346 if (!WaveAddr) {
4347 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4348 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4349 .addReg(SrcReg)
4350 .addImm(Subtarget->getWavefrontSizeLog2())
4351 .setOperandDead(3); // Dead scc
4352 }
4353
4354 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4355 .addReg(WaveAddr);
4356
4357 MI.eraseFromParent();
4358 return true;
4359}
4360
4362
4363 if (!I.isPreISelOpcode()) {
4364 if (I.isCopy())
4365 return selectCOPY(I);
4366 return true;
4367 }
4368
4369 switch (I.getOpcode()) {
4370 case TargetOpcode::G_AND:
4371 case TargetOpcode::G_OR:
4372 case TargetOpcode::G_XOR:
4373 if (selectBITOP3(I))
4374 return true;
4375 if (selectImpl(I, *CoverageInfo))
4376 return true;
4377 return selectG_AND_OR_XOR(I);
4378 case TargetOpcode::G_ADD:
4379 case TargetOpcode::G_SUB:
4380 case TargetOpcode::G_PTR_ADD:
4381 if (selectImpl(I, *CoverageInfo))
4382 return true;
4383 return selectG_ADD_SUB(I);
4384 case TargetOpcode::G_UADDO:
4385 case TargetOpcode::G_USUBO:
4386 case TargetOpcode::G_UADDE:
4387 case TargetOpcode::G_USUBE:
4388 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4389 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4390 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4391 return selectG_AMDGPU_MAD_64_32(I);
4392 case TargetOpcode::G_INTTOPTR:
4393 case TargetOpcode::G_BITCAST:
4394 case TargetOpcode::G_PTRTOINT:
4395 case TargetOpcode::G_FREEZE:
4396 return selectCOPY(I);
4397 case TargetOpcode::G_FNEG:
4398 if (selectImpl(I, *CoverageInfo))
4399 return true;
4400 return selectG_FNEG(I);
4401 case TargetOpcode::G_FABS:
4402 if (selectImpl(I, *CoverageInfo))
4403 return true;
4404 return selectG_FABS(I);
4405 case TargetOpcode::G_EXTRACT:
4406 return selectG_EXTRACT(I);
4407 case TargetOpcode::G_MERGE_VALUES:
4408 case TargetOpcode::G_CONCAT_VECTORS:
4409 return selectG_MERGE_VALUES(I);
4410 case TargetOpcode::G_UNMERGE_VALUES:
4411 return selectG_UNMERGE_VALUES(I);
4412 case TargetOpcode::G_BUILD_VECTOR:
4413 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4414 return selectG_BUILD_VECTOR(I);
4415 case TargetOpcode::G_IMPLICIT_DEF:
4416 return selectG_IMPLICIT_DEF(I);
4417 case TargetOpcode::G_INSERT:
4418 return selectG_INSERT(I);
4419 case TargetOpcode::G_INTRINSIC:
4420 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4421 return selectG_INTRINSIC(I);
4422 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4423 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4424 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4425 case TargetOpcode::G_ICMP:
4426 case TargetOpcode::G_FCMP:
4427 if (selectG_ICMP_or_FCMP(I))
4428 return true;
4429 return selectImpl(I, *CoverageInfo);
4430 case TargetOpcode::G_LOAD:
4431 case TargetOpcode::G_ZEXTLOAD:
4432 case TargetOpcode::G_SEXTLOAD:
4433 case TargetOpcode::G_STORE:
4434 case TargetOpcode::G_ATOMIC_CMPXCHG:
4435 case TargetOpcode::G_ATOMICRMW_XCHG:
4436 case TargetOpcode::G_ATOMICRMW_ADD:
4437 case TargetOpcode::G_ATOMICRMW_SUB:
4438 case TargetOpcode::G_ATOMICRMW_AND:
4439 case TargetOpcode::G_ATOMICRMW_OR:
4440 case TargetOpcode::G_ATOMICRMW_XOR:
4441 case TargetOpcode::G_ATOMICRMW_MIN:
4442 case TargetOpcode::G_ATOMICRMW_MAX:
4443 case TargetOpcode::G_ATOMICRMW_UMIN:
4444 case TargetOpcode::G_ATOMICRMW_UMAX:
4445 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4446 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4447 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4448 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4449 case TargetOpcode::G_ATOMICRMW_FADD:
4450 case TargetOpcode::G_ATOMICRMW_FMIN:
4451 case TargetOpcode::G_ATOMICRMW_FMAX:
4452 return selectG_LOAD_STORE_ATOMICRMW(I);
4453 case TargetOpcode::G_SELECT:
4454 return selectG_SELECT(I);
4455 case TargetOpcode::G_TRUNC:
4456 return selectG_TRUNC(I);
4457 case TargetOpcode::G_SEXT:
4458 case TargetOpcode::G_ZEXT:
4459 case TargetOpcode::G_ANYEXT:
4460 case TargetOpcode::G_SEXT_INREG:
4461 // This is a workaround. For extension from type i1, `selectImpl()` uses
4462 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4463 // i1 can only be hold in a SGPR class.
4464 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4465 selectImpl(I, *CoverageInfo))
4466 return true;
4467 return selectG_SZA_EXT(I);
4468 case TargetOpcode::G_FPEXT:
4469 if (selectG_FPEXT(I))
4470 return true;
4471 return selectImpl(I, *CoverageInfo);
4472 case TargetOpcode::G_BRCOND:
4473 return selectG_BRCOND(I);
4474 case TargetOpcode::G_GLOBAL_VALUE:
4475 return selectG_GLOBAL_VALUE(I);
4476 case TargetOpcode::G_PTRMASK:
4477 return selectG_PTRMASK(I);
4478 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4479 return selectG_EXTRACT_VECTOR_ELT(I);
4480 case TargetOpcode::G_INSERT_VECTOR_ELT:
4481 return selectG_INSERT_VECTOR_ELT(I);
4482 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4483 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4484 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4485 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4486 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4487 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4489 assert(Intr && "not an image intrinsic with image pseudo");
4490 return selectImageIntrinsic(I, Intr);
4491 }
4492 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4493 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4494 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4495 return selectBVHIntersectRayIntrinsic(I);
4496 case AMDGPU::G_SBFX:
4497 case AMDGPU::G_UBFX:
4498 return selectG_SBFX_UBFX(I);
4499 case AMDGPU::G_SI_CALL:
4500 I.setDesc(TII.get(AMDGPU::SI_CALL));
4501 return true;
4502 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4503 return selectWaveAddress(I);
4504 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4505 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4506 return true;
4507 }
4508 case AMDGPU::G_STACKRESTORE:
4509 return selectStackRestore(I);
4510 case AMDGPU::G_PHI:
4511 return selectPHI(I);
4512 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4513 return selectCOPY_SCC_VCC(I);
4514 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4515 return selectCOPY_VCC_SCC(I);
4516 case AMDGPU::G_AMDGPU_READANYLANE:
4517 return selectReadAnyLane(I);
4518 case TargetOpcode::G_CONSTANT:
4519 case TargetOpcode::G_FCONSTANT:
4520 default:
4521 return selectImpl(I, *CoverageInfo);
4522 }
4523 return false;
4524}
4525
4527AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4528 return {{
4529 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4530 }};
4531
4532}
4533
4534std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4535 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4536 unsigned Mods = 0;
4537 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4538
4539 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4540 Src = MI->getOperand(1).getReg();
4541 Mods |= SISrcMods::NEG;
4542 MI = getDefIgnoringCopies(Src, *MRI);
4543 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4544 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4545 // denormal mode, but we're implicitly canonicalizing in a source operand.
4546 const ConstantFP *LHS =
4547 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4548 if (LHS && LHS->isZero()) {
4549 Mods |= SISrcMods::NEG;
4550 Src = MI->getOperand(2).getReg();
4551 }
4552 }
4553
4554 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4555 Src = MI->getOperand(1).getReg();
4556 Mods |= SISrcMods::ABS;
4557 }
4558
4559 if (OpSel)
4560 Mods |= SISrcMods::OP_SEL_0;
4561
4562 return std::pair(Src, Mods);
4563}
4564
4565std::pair<Register, unsigned>
4566AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4567 unsigned Mods;
4568 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4569 Mods |= SISrcMods::OP_SEL_1;
4570 return std::pair(Src, Mods);
4571}
4572
4573Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4574 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4575 bool ForceVGPR) const {
4576 if ((Mods != 0 || ForceVGPR) &&
4577 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4578
4579 // If we looked through copies to find source modifiers on an SGPR operand,
4580 // we now have an SGPR register source. To avoid potentially violating the
4581 // constant bus restriction, we need to insert a copy to a VGPR.
4582 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4583 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4584 TII.get(AMDGPU::COPY), VGPRSrc)
4585 .addReg(Src);
4586 Src = VGPRSrc;
4587 }
4588
4589 return Src;
4590}
4591
4592///
4593/// This will select either an SGPR or VGPR operand and will save us from
4594/// having to write an extra tablegen pattern.
4596AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4597 return {{
4598 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4599 }};
4600}
4601
// Select src0 with folded source modifiers (fneg/fabs, via
// selectVOP3ModsImpl), plus zero clamp and omod operands. If a modifier was
// folded off an SGPR value, copyToVGPRIfSrcFolded inserts a VGPR copy to
// avoid violating the constant bus restriction.
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}
4617
// Variant of selectVOP3Mods0 for VOP3B encodings: only fneg is folded
// (AllowAbs=false), and zero clamp/omod operands are appended.
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                           /*IsCanonicalizing=*/true,
                                           /*AllowAbs=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}
4635
// Re-emit the operand unchanged and append zero clamp/omod immediates; no
// source modifiers are folded.
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}
4644
// Select a source with folded fneg/fabs modifiers, rendering the stripped
// register (copied to a VGPR if a modifier was folded off an SGPR) followed
// by the modifier mask.
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
4658
// Same as selectVOP3Mods, except IsCanonicalizing=false, so a
// `fsub [+-]0, x` is NOT treated as a foldable negation.
AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
4674
// VOP3B variant of selectVOP3Mods: only fneg is folded (AllowAbs=false).
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
4690
// Match only when the source (looking through copies) carries no fneg/fabs;
// fail the pattern otherwise.
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  Register Reg = Root.getReg();
  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
  if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
    return {};
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
  }};
}
4701
4702enum class SrcStatus {
4707 // This means current op = [op_upper, op_lower] and src = -op_lower.
4710 // This means current op = [op_upper, op_lower] and src = [op_upper,
4711 // -op_lower].
4719};
4720/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4721static bool isTruncHalf(const MachineInstr *MI,
4722 const MachineRegisterInfo &MRI) {
4723 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4724 return false;
4725
4726 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4727 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4728 return DstSize * 2 == SrcSize;
4729}
4730
4731/// Test if the MI is logic shift right with half bits,
4732/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4733static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4734 if (MI->getOpcode() != AMDGPU::G_LSHR)
4735 return false;
4736
4737 Register ShiftSrc;
4738 std::optional<ValueAndVReg> ShiftAmt;
4739 if (mi_match(MI->getOperand(0).getReg(), MRI,
4740 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4741 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4742 unsigned Shift = ShiftAmt->Value.getZExtValue();
4743 return Shift * 2 == SrcSize;
4744 }
4745 return false;
4746}
4747
4748/// Test if the MI is shift left with half bits,
4749/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4750static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4751 if (MI->getOpcode() != AMDGPU::G_SHL)
4752 return false;
4753
4754 Register ShiftSrc;
4755 std::optional<ValueAndVReg> ShiftAmt;
4756 if (mi_match(MI->getOperand(0).getReg(), MRI,
4757 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4758 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4759 unsigned Shift = ShiftAmt->Value.getZExtValue();
4760 return Shift * 2 == SrcSize;
4761 }
4762 return false;
4763}
4764
4765/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4766static bool isUnmergeHalf(const MachineInstr *MI,
4767 const MachineRegisterInfo &MRI) {
4768 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4769 return false;
4770 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4771 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4772}
4773
4775
4777 const MachineRegisterInfo &MRI) {
4778 LLT OpTy = MRI.getType(Reg);
4779 if (OpTy.isScalar())
4780 return TypeClass::SCALAR;
4781 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4784}
4785
4787 const MachineRegisterInfo &MRI) {
4788 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4789 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4790 return SrcStatus::INVALID;
4791
4792 switch (S) {
4793 case SrcStatus::IS_SAME:
4794 if (NegType == TypeClass::VECTOR_OF_TWO) {
4795 // Vector of 2:
4796 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4797 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4798 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4799 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4801 }
4802 if (NegType == TypeClass::SCALAR) {
4803 // Scalar:
4804 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4805 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4806 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4807 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4808 return SrcStatus::IS_HI_NEG;
4809 }
4810 break;
4812 if (NegType == TypeClass::VECTOR_OF_TWO) {
4813 // Vector of 2:
4814 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4815 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4816 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4817 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4818 return SrcStatus::IS_LO_NEG;
4819 }
4820 if (NegType == TypeClass::SCALAR) {
4821 // Scalar:
4822 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4823 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4824 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4825 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4826 return SrcStatus::IS_SAME;
4827 }
4828 break;
4830 if (NegType == TypeClass::VECTOR_OF_TWO) {
4831 // Vector of 2:
4832 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4833 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4834 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4835 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4836 return SrcStatus::IS_HI_NEG;
4837 }
4838 if (NegType == TypeClass::SCALAR) {
4839 // Scalar:
4840 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4841 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4842 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4843 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4845 }
4846 break;
4848 if (NegType == TypeClass::VECTOR_OF_TWO) {
4849 // Vector of 2:
4850 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4851 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4852 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4853 // [SrcHi, SrcLo] = [OpHi, OpLo]
4854 return SrcStatus::IS_SAME;
4855 }
4856 if (NegType == TypeClass::SCALAR) {
4857 // Scalar:
4858 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4859 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4860 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4861 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4862 return SrcStatus::IS_LO_NEG;
4863 }
4864 break;
4866 // Vector of 2:
4867 // Src = CurrUpper
4868 // Curr = [CurrUpper, CurrLower]
4869 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4870 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4871 // Src = -OpUpper
4872 //
4873 // Scalar:
4874 // Src = CurrUpper
4875 // Curr = [CurrUpper, CurrLower]
4876 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4877 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4878 // Src = -OpUpper
4881 if (NegType == TypeClass::VECTOR_OF_TWO) {
4882 // Vector of 2:
4883 // Src = CurrLower
4884 // Curr = [CurrUpper, CurrLower]
4885 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4886 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4887 // Src = -OpLower
4889 }
4890 if (NegType == TypeClass::SCALAR) {
4891 // Scalar:
4892 // Src = CurrLower
4893 // Curr = [CurrUpper, CurrLower]
4894 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4895 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4896 // Src = OpLower
4898 }
4899 break;
4901 // Vector of 2:
4902 // Src = -CurrUpper
4903 // Curr = [CurrUpper, CurrLower]
4904 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4905 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4906 // Src = -(-OpUpper) = OpUpper
4907 //
4908 // Scalar:
4909 // Src = -CurrUpper
4910 // Curr = [CurrUpper, CurrLower]
4911 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4912 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4913 // Src = -(-OpUpper) = OpUpper
4916 if (NegType == TypeClass::VECTOR_OF_TWO) {
4917 // Vector of 2:
4918 // Src = -CurrLower
4919 // Curr = [CurrUpper, CurrLower]
4920 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4921 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4922 // Src = -(-OpLower) = OpLower
4924 }
4925 if (NegType == TypeClass::SCALAR) {
4926 // Scalar:
4927 // Src = -CurrLower
4928 // Curr = [CurrUpper, CurrLower]
4929 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4930 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4931 // Src = -OpLower
4933 }
4934 break;
4935 default:
4936 break;
4937 }
4938 llvm_unreachable("unexpected SrcStatus & NegType combination");
4939}
4940
4941static std::optional<std::pair<Register, SrcStatus>>
4942calcNextStatus(std::pair<Register, SrcStatus> Curr,
4943 const MachineRegisterInfo &MRI) {
4944 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4945
4946 unsigned Opc = MI->getOpcode();
4947
4948 // Handle general Opc cases.
4949 switch (Opc) {
4950 case AMDGPU::G_BITCAST:
4951 return std::optional<std::pair<Register, SrcStatus>>(
4952 {MI->getOperand(1).getReg(), Curr.second});
4953 case AMDGPU::COPY:
4954 if (MI->getOperand(1).getReg().isPhysical())
4955 return std::nullopt;
4956 return std::optional<std::pair<Register, SrcStatus>>(
4957 {MI->getOperand(1).getReg(), Curr.second});
4958 case AMDGPU::G_FNEG: {
4959 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4960 if (Stat == SrcStatus::INVALID)
4961 return std::nullopt;
4962 return std::optional<std::pair<Register, SrcStatus>>(
4963 {MI->getOperand(1).getReg(), Stat});
4964 }
4965 default:
4966 break;
4967 }
4968
4969 // Calc next Stat from current Stat.
4970 switch (Curr.second) {
4971 case SrcStatus::IS_SAME:
4972 if (isTruncHalf(MI, MRI))
4973 return std::optional<std::pair<Register, SrcStatus>>(
4974 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4975 else if (isUnmergeHalf(MI, MRI)) {
4976 if (Curr.first == MI->getOperand(0).getReg())
4977 return std::optional<std::pair<Register, SrcStatus>>(
4978 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4979 return std::optional<std::pair<Register, SrcStatus>>(
4980 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4981 }
4982 break;
4984 if (isTruncHalf(MI, MRI)) {
4985 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4986 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4987 // = [OpLowerHi, OpLowerLo]
4988 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4989 // = [-OpLowerHi, OpLowerLo]
4990 // = -OpLower
4991 return std::optional<std::pair<Register, SrcStatus>>(
4992 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4993 }
4994 if (isUnmergeHalf(MI, MRI)) {
4995 if (Curr.first == MI->getOperand(0).getReg())
4996 return std::optional<std::pair<Register, SrcStatus>>(
4997 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4998 return std::optional<std::pair<Register, SrcStatus>>(
4999 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5000 }
5001 break;
5003 if (isShlHalf(MI, MRI))
5004 return std::optional<std::pair<Register, SrcStatus>>(
5005 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5006 break;
5008 if (isLshrHalf(MI, MRI))
5009 return std::optional<std::pair<Register, SrcStatus>>(
5010 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5011 break;
5013 if (isShlHalf(MI, MRI))
5014 return std::optional<std::pair<Register, SrcStatus>>(
5015 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5016 break;
5018 if (isLshrHalf(MI, MRI))
5019 return std::optional<std::pair<Register, SrcStatus>>(
5020 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5021 break;
5022 default:
5023 break;
5024 }
5025 return std::nullopt;
5026}
5027
5028/// This is used to control valid status that current MI supports. For example,
5029/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5030/// bit on VOP3P.
5031/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5032/// for different MI on different arch
5034private:
5035 bool HasNeg = false;
5036 // Assume all complex pattern of VOP3P have opsel.
5037 bool HasOpsel = true;
5038
5039public:
5041 const MachineInstr *MI = MRI.getVRegDef(Reg);
5042 unsigned Opc = MI->getOpcode();
5043
5044 if (Opc == TargetOpcode::G_INTRINSIC) {
5045 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5046 // Only float point intrinsic has neg & neg_hi bits.
5047 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5048 HasNeg = true;
5050 // Keep same for generic op.
5051 HasNeg = true;
5052 }
5053 }
5054 bool checkOptions(SrcStatus Stat) const {
5055 if (!HasNeg &&
5056 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5057 return false;
5058 }
5059 if (!HasOpsel &&
5060 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5061 return false;
5062 }
5063 return true;
5064 }
5065};
5066
5069 int MaxDepth = 3) {
5070 int Depth = 0;
5071 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5073
5074 while (Depth <= MaxDepth && Curr.has_value()) {
5075 Depth++;
5076 if (SO.checkOptions(Curr.value().second))
5077 Statlist.push_back(Curr.value());
5078 Curr = calcNextStatus(Curr.value(), MRI);
5079 }
5080
5081 return Statlist;
5082}
5083
5084static std::pair<Register, SrcStatus>
5086 int MaxDepth = 3) {
5087 int Depth = 0;
5088 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5089 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5090
5091 while (Depth <= MaxDepth && Curr.has_value()) {
5092 Depth++;
5093 SrcStatus Stat = Curr.value().second;
5094 if (SO.checkOptions(Stat)) {
5095 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5097 LastSameOrNeg = Curr.value();
5098 }
5099 Curr = calcNextStatus(Curr.value(), MRI);
5100 }
5101
5102 return LastSameOrNeg;
5103}
5104
5105static bool isSameBitWidth(Register Reg1, Register Reg2,
5106 const MachineRegisterInfo &MRI) {
5107 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5108 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5109 return Width1 == Width2;
5110}
5111
5112static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5113 // SrcStatus::IS_LOWER_HALF remain 0.
5114 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5115 Mods ^= SISrcMods::NEG_HI;
5116 Mods |= SISrcMods::OP_SEL_1;
5117 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5118 Mods |= SISrcMods::OP_SEL_1;
5119 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5120 Mods ^= SISrcMods::NEG_HI;
5121 else if (HiStat == SrcStatus::IS_HI_NEG)
5122 Mods ^= SISrcMods::NEG_HI;
5123
5124 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5125 Mods ^= SISrcMods::NEG;
5126 Mods |= SISrcMods::OP_SEL_0;
5127 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5128 Mods |= SISrcMods::OP_SEL_0;
5129 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5130 Mods |= SISrcMods::NEG;
5131 else if (LoStat == SrcStatus::IS_HI_NEG)
5132 Mods ^= SISrcMods::NEG;
5133
5134 return Mods;
5135}
5136
5137static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5138 Register RootReg, const SIInstrInfo &TII,
5139 const MachineRegisterInfo &MRI) {
5140 auto IsHalfState = [](SrcStatus S) {
5143 };
5144 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5145 IsHalfState(HiStat);
5146}
5147
5148std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5149 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5150 unsigned Mods = 0;
5151 // No modification if Root type is not form of <2 x Type>.
5152 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5153 Mods |= SISrcMods::OP_SEL_1;
5154 return {RootReg, Mods};
5155 }
5156
5157 SearchOptions SO(RootReg, MRI);
5158
5159 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5160
5161 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5163 else if (Stat.second == SrcStatus::IS_HI_NEG)
5164 Mods ^= SISrcMods::NEG_HI;
5165 else if (Stat.second == SrcStatus::IS_LO_NEG)
5166 Mods ^= SISrcMods::NEG;
5167
5168 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5169
5170 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5171 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5172 Mods |= SISrcMods::OP_SEL_1;
5173 return {Stat.first, Mods};
5174 }
5175
5177 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5178
5179 if (StatlistHi.empty()) {
5180 Mods |= SISrcMods::OP_SEL_1;
5181 return {Stat.first, Mods};
5182 }
5183
5185 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5186
5187 if (StatlistLo.empty()) {
5188 Mods |= SISrcMods::OP_SEL_1;
5189 return {Stat.first, Mods};
5190 }
5191
5192 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5193 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5194 if (StatlistHi[I].first == StatlistLo[J].first &&
5195 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5196 StatlistHi[I].first, RootReg, TII, MRI))
5197 return {StatlistHi[I].first,
5198 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5199 }
5200 }
5201 // Packed instructions do not have abs modifiers.
5202 Mods |= SISrcMods::OP_SEL_1;
5203
5204 return {Stat.first, Mods};
5205}
5206
5207// Removed unused function `getAllKindImm` to eliminate dead code.
5208
5209static bool checkRB(Register Reg, unsigned int RBNo,
5210 const AMDGPURegisterBankInfo &RBI,
5211 const MachineRegisterInfo &MRI,
5212 const TargetRegisterInfo &TRI) {
5213 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5214 return RB->getID() == RBNo;
5215}
5216
5217// This function is used to get the correct register bank for returned reg.
5218// Assume:
5219// 1. VOP3P is always legal for VGPR.
5220// 2. RootOp's regbank is legal.
5221// Thus
5222// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5223// 2. If RootOp is VGPR, then NewOp must be VGPR.
5225 const AMDGPURegisterBankInfo &RBI,
5227 const TargetRegisterInfo &TRI,
5228 const SIInstrInfo &TII) {
5229 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5230 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5231 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5232 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5233 return NewReg;
5234
5235 MachineInstr *MI = MRI.getVRegDef(RootReg);
5236 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5237 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5238 return RootReg;
5239 }
5240
5241 MachineBasicBlock *BB = MI->getParent();
5242 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5243
5245 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5246 .addReg(NewReg);
5247
5248 // Only accept VGPR.
5249 return MIB->getOperand(0).getReg();
5250}
5251
// Shared helper for the VOP3P complex patterns: computes the packed source
// register and modifier mask via selectVOP3PModsImpl, then legalizes the
// returned register's bank before rendering reg + src_mods.
AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
                                                bool IsDOT) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  Register Reg;
  unsigned Mods;
  std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);

  Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
5266
// VOP3P source with modifiers (non-DOT form).
AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {

  return selectVOP3PRetHelper(Root);
}
5272
// VOP3P source with modifiers for DOT instructions (IsDOT=true).
AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {

  return selectVOP3PRetHelper(Root, true);
}
5278
// DOT source that must carry no modifiers: succeeds only when the computed
// mask is exactly the default OP_SEL_1, and then renders just the register.
AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
  if (Mods != SISrcMods::OP_SEL_1)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
}
5290
// F32 operand of a packed instruction: folds fneg/fabs and always sets
// OP_SEL_1 (see selectVOP3PModsF32Impl).
AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
5302
// F32 packed operand that must carry no modifiers: fails unless the computed
// mask is exactly the default OP_SEL_1; renders only the register.
AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
  if (Mods != SISrcMods::OP_SEL_1)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
}
5313
// Translate an i1 immediate op_sel flag (encoded as 0 or -1) into a VOP3P
// modifier mask: OP_SEL_1 is always set, OP_SEL_0 only for a true flag.
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
    MachineOperand &Root) const {
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (Root.getImm() != 0)
    Mods |= SISrcMods::OP_SEL_0;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
5327
5329 MachineInstr *InsertPt,
5330 MachineRegisterInfo &MRI) {
5331 const TargetRegisterClass *DstRegClass;
5332 switch (Elts.size()) {
5333 case 8:
5334 DstRegClass = &AMDGPU::VReg_256RegClass;
5335 break;
5336 case 4:
5337 DstRegClass = &AMDGPU::VReg_128RegClass;
5338 break;
5339 case 2:
5340 DstRegClass = &AMDGPU::VReg_64RegClass;
5341 break;
5342 default:
5343 llvm_unreachable("unhandled Reg sequence size");
5344 }
5345
5346 MachineIRBuilder B(*InsertPt);
5347 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5348 .addDef(MRI.createVirtualRegister(DstRegClass));
5349 for (unsigned i = 0; i < Elts.size(); ++i) {
5350 MIB.addReg(Elts[i]);
5352 }
5353 return MIB->getOperand(0).getReg();
5354}
5355
5356static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5358 MachineInstr *InsertPt,
5359 MachineRegisterInfo &MRI) {
5360 if (ModOpcode == TargetOpcode::G_FNEG) {
5361 Mods |= SISrcMods::NEG;
5362 // Check if all elements also have abs modifier
5363 SmallVector<Register, 8> NegAbsElts;
5364 for (auto El : Elts) {
5365 Register FabsSrc;
5366 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5367 break;
5368 NegAbsElts.push_back(FabsSrc);
5369 }
5370 if (Elts.size() != NegAbsElts.size()) {
5371 // Neg
5372 Src = buildRegSequence(Elts, InsertPt, MRI);
5373 } else {
5374 // Neg and Abs
5375 Mods |= SISrcMods::NEG_HI;
5376 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5377 }
5378 } else {
5379 assert(ModOpcode == TargetOpcode::G_FABS);
5380 // Abs
5381 Mods |= SISrcMods::NEG_HI;
5382 Src = buildRegSequence(Elts, InsertPt, MRI);
5383 }
5384}
5385
5387AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5388 Register Src = Root.getReg();
5389 unsigned Mods = SISrcMods::OP_SEL_1;
5391
5392 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5393 assert(BV->getNumSources() > 0);
5394 // Based on first element decide which mod we match, neg or abs
5395 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5396 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5397 ? AMDGPU::G_FNEG
5398 : AMDGPU::G_FABS;
5399 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5400 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5401 if (ElF32->getOpcode() != ModOpcode)
5402 break;
5403 EltsF32.push_back(ElF32->getOperand(1).getReg());
5404 }
5405
5406 // All elements had ModOpcode modifier
5407 if (BV->getNumSources() == EltsF32.size()) {
5408 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5409 *MRI);
5410 }
5411 }
5412
5413 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5414 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5415}
5416
// WMMA f16 source: if every element of the concat-vector source is a G_FNEG,
// strip the negations, set NEG|NEG_HI, and rebuild the source as a
// REG_SEQUENCE of the stripped elements; otherwise pass the source through
// with only the default OP_SEL_1.
AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> EltsV2F16;

  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
    for (unsigned i = 0; i < CV->getNumSources(); ++i) {
      Register FNegSrc;
      if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
        break;
      EltsV2F16.push_back(FNegSrc);
    }

    // All elements had ModOpcode modifier
    if (CV->getNumSources() == EltsV2F16.size()) {
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
      Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
5442
5444AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5445 Register Src = Root.getReg();
5446 unsigned Mods = SISrcMods::OP_SEL_1;
5447 SmallVector<Register, 8> EltsV2F16;
5448
5449 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5450 assert(CV->getNumSources() > 0);
5451 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5452 // Based on first element decide which mod we match, neg or abs
5453 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5454 ? AMDGPU::G_FNEG
5455 : AMDGPU::G_FABS;
5456
5457 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5458 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5459 if (ElV2F16->getOpcode() != ModOpcode)
5460 break;
5461 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5462 }
5463
5464 // All elements had ModOpcode modifier
5465 if (CV->getNumSources() == EltsV2F16.size()) {
5466 MachineIRBuilder B(*Root.getParent());
5467 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5468 *MRI);
5469 }
5470 }
5471
5472 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5473 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5474}
5475
// Match an inline-constant scalar/splat source for WMMA: first try a float
// constant or splat (rendered as its bit pattern), then an integer constant
// or splat. Fails when neither matches or the constant is not inlineable.
AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
    if (TII.isInlineConstant(FPValReg->Value)) {
      return {{[=](MachineInstrBuilder &MIB) {
        MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
      }}};
    }
    // Non-inlineable splat floats should not fall-through for integer immediate
    // checks.
    return {};
  }

  APInt ICst;
  if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
    if (TII.isInlineConstant(ICst)) {
      return {
          {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
    }
  }

  return {};
}
5500
// SWMMAC 8-bit index operand: if the source is a 32-bit value shifted right
// by a multiple of 8, use the shift source and encode index_key = shift/8;
// otherwise use the value as-is with index_key 0.
AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
      MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
      ShiftAmt->Value.getZExtValue() % 8 == 0) {
    Key = ShiftAmt->Value.getZExtValue() / 8;
    Src = ShiftSrc;
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
5521
// SWMMAC 16-bit index operand: if the source is a 32-bit value shifted right
// by exactly 16, use the shift source with index_key 1; otherwise use the
// value as-is with index_key 0.
AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {

  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
      MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
      ShiftAmt->Value.getZExtValue() == 16) {
    Src = ShiftSrc;
    Key = 1;
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
5543
// SWMMAC 32-bit index operand: look through a zero- or any-extend from s32;
// if the extended value is the second (high) result of a two-result
// G_UNMERGE_VALUES, select the unmerge source with index_key 1. Otherwise
// keep the original value with index_key 0.
AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register S32 = matchZeroExtendFromS32(Src);
  if (!S32)
    S32 = matchAnyExtendFromS32(Src);

  if (S32) {
    const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
    if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
      assert(Def->getNumOperands() == 3);
      Register DstReg1 = Def->getOperand(1).getReg();
      if (mi_match(S32, *MRI,
                   m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
        Src = Def->getOperand(2).getReg();
        Key = 1;
      }
    }
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
5572
5574AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5575 Register Src;
5576 unsigned Mods;
5577 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5578
5579 // FIXME: Handle op_sel
5580 return {{
5581 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5582 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5583 }};
5584}
5585
5586// FIXME-TRUE16 remove when fake16 is removed
5588AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5589 Register Src;
5590 unsigned Mods;
5591 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5592 /*IsCanonicalizing=*/true,
5593 /*AllowAbs=*/false,
5594 /*OpSel=*/false);
5595
5596 return {{
5597 [=](MachineInstrBuilder &MIB) {
5598 MIB.addReg(
5599 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5600 },
5601 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5602 }};
5603}
5604
5606AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5607 Register Src;
5608 unsigned Mods;
5609 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5610 /*IsCanonicalizing=*/true,
5611 /*AllowAbs=*/false,
5612 /*OpSel=*/true);
5613
5614 return {{
5615 [=](MachineInstrBuilder &MIB) {
5616 MIB.addReg(
5617 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5618 },
5619 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5620 }};
5621}
5622
// Given \p Offset and load specified by the \p Root operand check if \p Offset
// is a multiple of the load byte size. If it is update \p Offset to a
// pre-scaled value and return true.
bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
                                                  bool IsSigned) const {
  // Scale-offset (SCAL) addressing is only available on subtargets that
  // report hasScaleOffset().
  if (!Subtarget->hasScaleOffset())
    return false;

  const MachineInstr &MI = *Root.getParent();
  MachineMemOperand *MMO = *MI.memoperands_begin();

  // A known access size is required to recognize "index * size" patterns.
  if (!MMO->getSize().hasValue())
    return false;

  uint64_t Size = MMO->getSize().getValue();

  // Look through a 32->64 extend (zext, or sext when IsSigned) to the
  // underlying 32-bit index computation.
  Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
  if (!OffsetReg)
    OffsetReg = Offset;

  if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
    OffsetReg = Def->Reg;

  // Accept offset = Op0 << log2(Size), offset = Op0 * Size (64x32 scalar
  // multiply pseudo), or a mad of (Op0, Size, 0).
  // NOTE(review): several source lines of this matcher expression appear to
  // be missing from this rendering of the file — verify against upstream.
  Register Op0;
  MachineInstr *Mul;
  bool ScaleOffset =
      (isPowerOf2_64(Size) &&
       mi_match(OffsetReg, *MRI,
                m_GShl(m_Reg(Op0),
       mi_match(OffsetReg, *MRI,
                    m_Copy(m_SpecificICst(Size))))) ||
      mi_match(
          OffsetReg, *MRI,
          m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
                  m_Reg(Op0), m_SpecificICst(Size))) ||
      // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
      (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
       (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
                                      : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
        (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
         VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
       mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
       mi_match(Mul->getOperand(3).getReg(), *MRI,
                m_Copy(m_SpecificICst(Size))))) &&
       mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));

  // On success, report the pre-scaled index through the in/out parameter.
  if (ScaleOffset)
    Offset = Op0;

  return ScaleOffset;
}
5679
// Match an SMRD address as (SGPR base [+ SGPR offset] [+ encoded immediate]).
// Which out-pointers are non-null selects which _IMM/_SGPR/_SGPR_IMM variant
// the caller is trying to form; returns true and fills the out-params on
// success.
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                 Register &Base,
                                                 Register *SOffset,
                                                 int64_t *Offset,
                                                 bool *ScaleOffset) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets.
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  if (AddrInfo.empty())
    return false;

  const GEPInfo &GEPI = AddrInfo[0];
  std::optional<int64_t> EncodedImm;

  if (ScaleOffset)
    *ScaleOffset = false;

  if (SOffset && Offset) {
    // Caller wants both an SGPR offset and an immediate: try to split
    // (base + reg) + imm across the two fields using the two outermost GEPs.
    EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                              /*HasSOffset=*/true);
    if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
        AddrInfo.size() > 1) {
      const GEPInfo &GEPI2 = AddrInfo[1];
      if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
        Register OffsetReg = GEPI2.SgprParts[1];
        if (ScaleOffset)
          *ScaleOffset =
              selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
        // The SOffset field is 32 bits; only usable if the register is a
        // 32-bit value or a zext of one.
        OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
        if (OffsetReg) {
          Base = GEPI2.SgprParts[0];
          *SOffset = OffsetReg;
          *Offset = *EncodedImm;
          if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
            return true;

          // For unbuffered smem loads, it is illegal for the Immediate Offset
          // to be negative if the resulting (Offset + (M0 or SOffset or zero)
          // is negative. Handle the case where the Immediate Offset + SOffset
          // is negative.
          auto SKnown = VT->getKnownBits(*SOffset);
          if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
            return false;

          return true;
        }
      }
    }
    return false;
  }

  // Immediate-only form.
  EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                            /*HasSOffset=*/false);
  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
    Base = GEPI.SgprParts[0];
    *Offset = *EncodedImm;
    return true;
  }

  // SGPR offset is unsigned.
  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
      GEPI.Imm != 0) {
    // If we make it this far we have a load with an 32-bit immediate offset.
    // It is OK to select this using a sgpr offset, because we have already
    // failed trying to select this load into one of the _IMM variants since
    // the _IMM Patterns are considered before the _SGPR patterns.
    Base = GEPI.SgprParts[0];
    *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
        .addImm(GEPI.Imm);
    return true;
  }

  // Pure (sgpr base + sgpr offset) form.
  // NOTE(review): SgprParts[1] is read below but the guard only checks
  // size() != 0 — presumably size is always 2 when Imm == 0 here; confirm.
  if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
    Register OffsetReg = GEPI.SgprParts[1];
    if (ScaleOffset)
      *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
    OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
    if (OffsetReg) {
      Base = GEPI.SgprParts[0];
      *SOffset = OffsetReg;
      return true;
    }
  }

  return false;
}
5772
5774AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5775 Register Base;
5776 int64_t Offset;
5777 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5778 /* ScaleOffset */ nullptr))
5779 return std::nullopt;
5780
5781 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5782 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5783}
5784
5786AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5787 SmallVector<GEPInfo, 4> AddrInfo;
5788 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5789
5790 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5791 return std::nullopt;
5792
5793 const GEPInfo &GEPInfo = AddrInfo[0];
5794 Register PtrReg = GEPInfo.SgprParts[0];
5795 std::optional<int64_t> EncodedImm =
5796 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5797 if (!EncodedImm)
5798 return std::nullopt;
5799
5800 return {{
5801 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5802 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5803 }};
5804}
5805
5807AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5808 Register Base, SOffset;
5809 bool ScaleOffset;
5810 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5811 &ScaleOffset))
5812 return std::nullopt;
5813
5814 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5815 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5816 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5817 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5818}
5819
5821AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5822 Register Base, SOffset;
5823 int64_t Offset;
5824 bool ScaleOffset;
5825 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5826 return std::nullopt;
5827
5828 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5829 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5830 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5831 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5832 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5833}
5834
5835std::pair<Register, int>
5836AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5837 uint64_t FlatVariant) const {
5838 MachineInstr *MI = Root.getParent();
5839
5840 auto Default = std::pair(Root.getReg(), 0);
5841
5842 if (!STI.hasFlatInstOffsets())
5843 return Default;
5844
5845 Register PtrBase;
5846 int64_t ConstOffset;
5847 bool IsInBounds;
5848 std::tie(PtrBase, ConstOffset, IsInBounds) =
5849 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5850
5851 // Adding the offset to the base address with an immediate in a FLAT
5852 // instruction must not change the memory aperture in which the address falls.
5853 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5854 // instructions.
5855 if (ConstOffset == 0 ||
5856 (FlatVariant == SIInstrFlags::FlatScratch &&
5857 !isFlatScratchBaseLegal(Root.getReg())) ||
5858 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5859 return Default;
5860
5861 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5862 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5863 return Default;
5864
5865 return std::pair(PtrBase, ConstOffset);
5866}
5867
5869AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5870 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5871
5872 return {{
5873 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5874 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5875 }};
5876}
5877
5879AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5880 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5881
5882 return {{
5883 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5884 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5885 }};
5886}
5887
5889AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5890 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5891
5892 return {{
5893 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5894 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5895 }};
5896}
5897
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
                                             unsigned CPolBits,
                                             bool NeedIOffset) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0) {
    // If the instruction has an immediate-offset field and the constant is a
    // legal global-FLAT offset, fold it there.
    // NOTE(review): the continuation line of this isLegalFLATOffset call is
    // missing from this rendering — verify against upstream.
    if (NeedIOffset &&
        TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
      Addr = PtrBase;
      ImmOffset = ConstOffset;
    } else {
      auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
      if (isSGPR(PtrBaseDef->Reg)) {
        if (ConstOffset > 0) {
          // Offset is too large.
          //
          // saddr + large_offset -> saddr +
          //                         (voffset = large_offset & ~MaxOffset) +
          //                         (large_offset & MaxOffset);
          int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
          if (NeedIOffset) {
            std::tie(SplitImmOffset, RemainderOffset) =
                TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
          }

          if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
                                              : isUInt<32>(RemainderOffset)) {
            MachineInstr *MI = Root.getParent();
            MachineBasicBlock *MBB = MI->getParent();
            Register HighBits =
                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

            // Materialize the remainder in a VGPR to serve as voffset.
            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
                    HighBits)
                .addImm(RemainderOffset);

            if (NeedIOffset)
              return {{
                  [=](MachineInstrBuilder &MIB) {
                    MIB.addReg(PtrBase);
                  }, // saddr
                  [=](MachineInstrBuilder &MIB) {
                    MIB.addReg(HighBits);
                  }, // voffset
                  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
                  [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
              }};
            return {{
                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
                [=](MachineInstrBuilder &MIB) {
                  MIB.addReg(HighBits);
                }, // voffset
                [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
            }};
          }
        }

        // We are adding a 64 bit SGPR and a constant. If constant bus limit
        // is 1 we would need to perform 1 or 2 extra moves for each half of
        // the constant and it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it is less
        // instructions to perform VALU adds with immediates or inline literals.
        unsigned NumLiterals =
            !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
            !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
          return std::nullopt;
      }
    }
  }

  // Match the variable offset.
  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    // Look through the SGPR->VGPR copy.
    Register SAddr =
        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);

    if (isSGPR(SAddr)) {
      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();

      // It's possible voffset is an SGPR here, but the copy to VGPR will be
      // inserted later.
      bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
                                           Subtarget->hasSignedGVSOffset());
      if (Register VOffset = matchExtendFromS32OrS32(
              PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
        if (NeedIOffset)
          return {{[=](MachineInstrBuilder &MIB) { // saddr
                     MIB.addReg(SAddr);
                   },
                   [=](MachineInstrBuilder &MIB) { // voffset
                     MIB.addReg(VOffset);
                   },
                   [=](MachineInstrBuilder &MIB) { // offset
                     MIB.addImm(ImmOffset);
                   },
                   [=](MachineInstrBuilder &MIB) { // cpol
                     MIB.addImm(CPolBits |
                                (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
                   }}};
        return {{[=](MachineInstrBuilder &MIB) { // saddr
                   MIB.addReg(SAddr);
                 },
                 [=](MachineInstrBuilder &MIB) { // voffset
                   MIB.addReg(VOffset);
                 },
                 [=](MachineInstrBuilder &MIB) { // cpol
                   MIB.addImm(CPolBits |
                              (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
                 }}};
      }
    }
  }

  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
  // drop this.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
    return std::nullopt;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
      .addImm(0);

  if (NeedIOffset)
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },    // offset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }      // cpol
    }};
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }      // cpol
  }};
}
6053
6055AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6056 return selectGlobalSAddr(Root, 0);
6057}
6058
6060AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6061 const MachineInstr &I = *Root.getParent();
6062
6063 // We are assuming CPol is always the last operand of the intrinsic.
6064 auto PassedCPol =
6065 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6066 return selectGlobalSAddr(Root, PassedCPol);
6067}
6068
6070AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6071 const MachineInstr &I = *Root.getParent();
6072
6073 // We are assuming CPol is second from last operand of the intrinsic.
6074 auto PassedCPol =
6075 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6076 return selectGlobalSAddr(Root, PassedCPol);
6077}
6078
6080AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6081 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6082}
6083
6085AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6086 MachineOperand &Root) const {
6087 const MachineInstr &I = *Root.getParent();
6088
6089 // We are assuming CPol is always the last operand of the intrinsic.
6090 auto PassedCPol =
6091 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6092 return selectGlobalSAddr(Root, PassedCPol, false);
6093}
6094
6096AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6097 MachineOperand &Root) const {
6098 const MachineInstr &I = *Root.getParent();
6099
6100 // We are assuming CPol is second from last operand of the intrinsic.
6101 auto PassedCPol =
6102 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6103 return selectGlobalSAddr(Root, PassedCPol, false);
6104}
6105
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(Addr, *MRI);

  // Fold a legal scratch immediate into the instruction's offset field.
  // NOTE(review): the continuation line of this isLegalFLATOffset call is
  // missing from this rendering — verify against upstream.
  if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
    Addr = PtrBase;
    ImmOffset = ConstOffset;
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  // A plain frame index can be rendered directly as saddr.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = AddrDef->MI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
    }};
  }

  Register SAddr = AddrDef->Reg;

  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    Register LHS = AddrDef->MI->getOperand(1).getReg();
    Register RHS = AddrDef->MI->getOperand(2).getReg();
    auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
    auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);

    // (frame index + sgpr): fold into a scalar add that produces saddr.
    if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
        isSGPR(RHSDef->Reg)) {
      int FI = LHSDef->MI->getOperand(1).getIndex();
      MachineInstr &I = *Root.getParent();
      MachineBasicBlock *BB = I.getParent();
      const DebugLoc &DL = I.getDebugLoc();
      SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
          .addFrameIndex(FI)
          .addReg(RHSDef->Reg)
          .setOperandDead(3); // Dead scc
    }
  }

  // The SADDR form requires a scalar address.
  if (!isSGPR(SAddr))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
  }};
}
6165
6166// Check whether the flat scratch SVS swizzle bug affects this access.
6167bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6168 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6169 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6170 return false;
6171
6172 // The bug affects the swizzling of SVS accesses if there is any carry out
6173 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6174 // voffset to (soffset + inst_offset).
6175 auto VKnown = VT->getKnownBits(VAddr);
6176 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6177 KnownBits::makeConstant(APInt(32, ImmOffset)));
6178 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6179 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6180 return (VMax & 3) + (SMax & 3) >= 4;
6181}
6182
AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(Addr, *MRI);

  // Remember the pre-split address; legality checks below differ depending
  // on whether an immediate was peeled off.
  // NOTE(review): the continuation line of this isLegalFLATOffset call is
  // missing from this rendering — verify against upstream.
  Register OrigAddr = Addr;
  if (ConstOffset != 0 &&
      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
    Addr = PtrBase;
    ImmOffset = ConstOffset;
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
    return std::nullopt;

  // The VGPR addend becomes vaddr; it must already live in the VGPR bank.
  Register RHS = AddrDef->MI->getOperand(2).getReg();
  if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
    return std::nullopt;

  Register LHS = AddrDef->MI->getOperand(1).getReg();
  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);

  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return std::nullopt;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return std::nullopt;
  }

  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
    return std::nullopt;

  // A pre-scaled vaddr is signalled via the SCAL cache-policy bit.
  // NOTE(review): a line of this conditional expression is missing from
  // this rendering — verify against upstream.
  unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
                      : 0;

  // (frame index + vgpr): render the frame index directly as saddr.
  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = LHSDef->MI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
    }};
  }

  // Otherwise the scalar addend must be (or resolve through copies to) an
  // SGPR.
  if (!isSGPR(LHS))
    if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
      LHS = Def->Reg;

  if (!isSGPR(LHS))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
  }};
}
6253
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  // Constant address: split into (vaddr = bits above MaxOffset, imm = rest).
  // NOTE(review): the continuation line of this condition is missing from
  // this rendering — verify against upstream.
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
        .addImm(Offset & ~MaxOffset);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               // Use constant zero for soffset and rely on eliminateFrameIndex
               // to choose the appropriate frame register if need be.
               MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & MaxOffset);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  std::optional<int> FI;
  Register VAddr = Root.getReg();

  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  Register PtrBase;
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(VAddr, *MRI);
  if (ConstOffset != 0) {
    // Fold a legal immediate; on range-checked targets the base must be
    // provably non-negative.
    if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
        (!STI.privateMemoryResourceIsRangeChecked() ||
         VT->signBitIsZero(PtrBase))) {
      const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
      if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
        FI = PtrBaseDef->getOperand(1).getIndex();
      else
        VAddr = PtrBase;
      Offset = ConstOffset;
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    FI = RootDef->getOperand(1).getIndex();
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI)
               MIB.addFrameIndex(*FI);
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // Use constant zero for soffset and rely on eliminateFrameIndex
             // to choose the appropriate frame register if need be.
             MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}
6334
6335bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6336 int64_t Offset) const {
6337 if (!isUInt<16>(Offset))
6338 return false;
6339
6340 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6341 return true;
6342
6343 // On Southern Islands instruction with a negative base value and an offset
6344 // don't seem to work.
6345 return VT->signBitIsZero(Base);
6346}
6347
6348bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6349 int64_t Offset1,
6350 unsigned Size) const {
6351 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6352 return false;
6353 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6354 return false;
6355
6356 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6357 return true;
6358
6359 // On Southern Islands instruction with a negative base value and an offset
6360 // don't seem to work.
6361 return VT->signBitIsZero(Base);
6362}
6363
// Return whether the operation has NoUnsignedWrap property.
// NOTE(review): the final line of this expression (presumably testing the
// G_PTR_ADD's no-unsigned-wrap flag) is missing from this rendering —
// verify against upstream. G_OR is accepted unconditionally here,
// presumably because it is only formed from disjoint bits; confirm.
static bool isNoUnsignedWrap(MachineInstr *Addr) {
  return Addr->getOpcode() == TargetOpcode::G_OR ||
         (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
}
6370
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);

  // An add known not to wrap unsigned cannot turn legal inputs into an
  // illegal (wrapped) address.
  if (isNoUnsignedWrap(AddrMI))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (STI.hasSignedScratchOffsets())
    return true;

  Register LHS = AddrMI->getOperand(1).getReg();
  Register RHS = AddrMI->getOperand(2).getReg();

  // NOTE(review): the initializer line of RhsValReg (presumably a constant
  // lookup of RHS) is missing from this rendering — verify upstream.
  if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    std::optional<ValueAndVReg> RhsValReg =
    // If the immediate offset is negative and within certain range, the base
    // address cannot also be negative. If the base is also negative, the sum
    // would be either negative or much larger than the valid range of scratch
    // memory a thread can access.
    if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
        RhsValReg->Value.getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise the base must be provably non-negative.
  return VT->signBitIsZero(LHS);
}
6402
6403// Check address value in SGPR/VGPR are legal for flat scratch in the form
6404// of: SGPR + VGPR.
6405bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6406 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6407
6408 if (isNoUnsignedWrap(AddrMI))
6409 return true;
6410
6411 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6412 // values.
6413 if (STI.hasSignedScratchOffsets())
6414 return true;
6415
6416 Register LHS = AddrMI->getOperand(1).getReg();
6417 Register RHS = AddrMI->getOperand(2).getReg();
6418 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6419}
6420
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
    Register Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (STI.hasSignedScratchOffsets())
    return true;

  // Addr is expected to be (base + imm); peel both apart.
  // NOTE(review): the initializer lines of BaseDef and RHSOffset (presumably
  // getDefSrcRegIgnoringCopies(Base, ...) and a constant lookup of the imm
  // operand) are missing from this rendering — verify against upstream.
  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
  Register Base = AddrMI->getOperand(1).getReg();
  std::optional<DefinitionAndSourceRegister> BaseDef =
  std::optional<ValueAndVReg> RHSOffset =
  assert(RHSOffset);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(BaseDef->MI) &&
      (isNoUnsignedWrap(AddrMI) ||
       (RHSOffset->Value.getSExtValue() < 0 &&
        RHSOffset->Value.getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both addends of the inner SGPR + VGPR add must be provably
  // non-negative.
  Register LHS = BaseDef->MI->getOperand(1).getReg();
  Register RHS = BaseDef->MI->getOperand(2).getReg();
  return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
}
6452
6453bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6454 unsigned ShAmtBits) const {
6455 assert(MI.getOpcode() == TargetOpcode::G_AND);
6456
6457 std::optional<APInt> RHS =
6458 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6459 if (!RHS)
6460 return false;
6461
6462 if (RHS->countr_one() >= ShAmtBits)
6463 return true;
6464
6465 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6466 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6467}
6468
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  Register Reg = Root.getReg();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  // NOTE(review): the initializer line of Def (presumably
  // getDefSrcRegIgnoringCopies(Reg, *MRI)) is missing from this rendering —
  // verify against upstream.
  std::optional<DefinitionAndSourceRegister> Def =
  assert(Def && "this shouldn't be an optional result");
  Reg = Def->Reg;

  // A wave address selects directly as soffset with a zero immediate.
  if (Register WaveBase = getWaveAddress(Def->MI)) {
    return {{
        [=](MachineInstrBuilder &MIB) { // rsrc
          MIB.addReg(Info->getScratchRSrcReg());
        },
        [=](MachineInstrBuilder &MIB) { // soffset
          MIB.addReg(WaveBase);
        },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
    }};
  }

  int64_t Offset = 0;

  // (wave address + constant): soffset = wave base, offset = constant.
  // NOTE(review): the BasePtr declaration and the tail of this mi_match
  // expression are missing from this rendering — verify against upstream.
  // FIXME: Copy check is a hack
  if (mi_match(Reg, *MRI,
               m_GPtrAdd(m_Reg(BasePtr),
    if (!TII.isLegalMUBUFImmOffset(Offset))
      return {};
    MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
    Register WaveBase = getWaveAddress(BasePtrDef);
    if (!WaveBase)
      return {};

    return {{
        [=](MachineInstrBuilder &MIB) { // rsrc
          MIB.addReg(Info->getScratchRSrcReg());
        },
        [=](MachineInstrBuilder &MIB) { // soffset
          MIB.addReg(WaveBase);
        },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
    }};
  }

  // Plain constant address: zero soffset plus a legal immediate offset.
  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !TII.isLegalMUBUFImmOffset(Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(Info->getScratchRSrcReg());
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
  }};
}
6531
6532std::pair<Register, unsigned>
6533AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6534 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6535 int64_t ConstAddr = 0;
6536
6537 Register PtrBase;
6538 int64_t Offset;
6539 std::tie(PtrBase, Offset, std::ignore) =
6540 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6541
6542 if (Offset) {
6543 if (isDSOffsetLegal(PtrBase, Offset)) {
6544 // (add n0, c0)
6545 return std::pair(PtrBase, Offset);
6546 }
6547 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6548 // TODO
6549
6550
6551 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6552 // TODO
6553
6554 }
6555
6556 return std::pair(Root.getReg(), 0);
6557}
6558
// Wrap selectDS1Addr1OffsetImpl's (register, offset) result as renderer
// functions for the DS addr/offset operand pair.
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
  }};
}
6569
// DS read2/write2 of two 32-bit values: offsets are in 4-byte units.
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 4);
}
6574
// DS read2/write2 of two 64-bit values: offsets are in 8-byte units.
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 8);
}
6579
// Common selection for ds_read2/ds_write2-style operands: a base register
// plus two consecutive element offsets (offset1 = offset0 + 1).
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
                                              unsigned Size) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
  }};
}
6592
6593std::pair<Register, unsigned>
6594AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6595 unsigned Size) const {
6596 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6597 int64_t ConstAddr = 0;
6598
6599 Register PtrBase;
6600 int64_t Offset;
6601 std::tie(PtrBase, Offset, std::ignore) =
6602 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6603
6604 if (Offset) {
6605 int64_t OffsetValue0 = Offset;
6606 int64_t OffsetValue1 = Offset + Size;
6607 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6608 // (add n0, c0)
6609 return std::pair(PtrBase, OffsetValue0 / Size);
6610 }
6611 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6612 // TODO
6613
6614 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6615 // TODO
6616
6617 }
6618
6619 return std::pair(Root.getReg(), 0);
6620}
6621
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset, and if the offset computation is
/// known to be inbounds. There may be intervening copies between \p Root and
/// the identified constant. Returns \p Root, 0, false if this does not match
/// the pattern.
std::tuple<Register, int64_t, bool>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  // Look through copies to the pointer arithmetic.
  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0, false};

  MachineOperand &RHS = RootI->getOperand(2);
  std::optional<ValueAndVReg> MaybeOffset =
  if (!MaybeOffset)
    return {Root, 0, false};
  // Propagate the inbounds flag so callers can reason about wrap-around.
  bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
          IsInBounds};
}
6643
6645 MIB.addImm(0);
6646}
6647
/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  // Component registers of the final 128-bit descriptor.
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  // Materialize the two constant 32-bit format words.
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  // The low 64 bits hold the base pointer; substitute 0 when none was given.
  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}
6692
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  // addr64 form: low format word is 0, high word is the default data format.
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}
6701
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  // offset form: low format word is all-ones, high word is the default format.
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}
6710
6711AMDGPUInstructionSelector::MUBUFAddressData
6712AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6713 MUBUFAddressData Data;
6714 Data.N0 = Src;
6715
6716 Register PtrBase;
6717 int64_t Offset;
6718
6719 std::tie(PtrBase, Offset, std::ignore) =
6720 getPtrBaseWithConstantOffset(Src, *MRI);
6721 if (isUInt<32>(Offset)) {
6722 Data.N0 = PtrBase;
6723 Data.Offset = Offset;
6724 }
6725
6726 if (MachineInstr *InputAdd
6727 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6728 Data.N2 = InputAdd->getOperand(1).getReg();
6729 Data.N3 = InputAdd->getOperand(2).getReg();
6730
6731 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6732 // FIXME: Don't know this was defined by operand 0
6733 //
6734 // TODO: Remove this when we have copy folding optimizations after
6735 // RegBankSelect.
6736 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6737 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6738 }
6739
6740 return Data;
6741}
6742
6743/// Return if the addr64 mubuf mode should be used for the given address.
6744bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6745 // (ptr_add N2, N3) -> addr64, or
6746 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6747 if (Addr.N2)
6748 return true;
6749
6750 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6751 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6752}
6753
6754/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6755/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6756/// component.
6757void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6758 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6759 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6760 return;
6761
6762 // Illegal offset, store it in soffset.
6763 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6764 B.buildInstr(AMDGPU::S_MOV_B32)
6765 .addDef(SOffset)
6766 .addImm(ImmOffset);
6767 ImmOffset = 0;
6768}
6769
// Decompose an address into the operands of a MUBUF addr64 access: a 64-bit
// VGPR address (VAddr), a resource descriptor carrying any SGPR base
// pointer, and a split soffset/immediate offset.
bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
    Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for volcanic islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        // N3 is uniform: it goes in the resource, N2 is the VGPR address.
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}
6820
6821bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6822 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6823 int64_t &Offset) const {
6824
6825 // FIXME: Pattern should not reach here.
6826 if (STI.useFlatForGlobal())
6827 return false;
6828
6829 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6830 if (shouldUseAddr64(AddrData))
6831 return false;
6832
6833 // N0 -> offset, or
6834 // (N0 + C1) -> offset
6835 Register SRDPtr = AddrData.N0;
6836 Offset = AddrData.Offset;
6837
6838 // TODO: Look through extensions for 32-bit soffset.
6839 MachineIRBuilder B(*Root.getParent());
6840
6841 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6842 splitIllegalMUBUFOffset(B, SOffset, Offset);
6843 return true;
6844}
6845
// Render the full MUBUF addr64 operand list; fails when the addr64 form does
// not apply to this address.
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm  // swz
  }};
}
6881
// Render the MUBUF offset-only (no vaddr) operand list.
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm, // swz
  }};
}
6909
// Select the soffset operand of a buffer access, canonicalizing a known-zero
// soffset to SGPR_NULL on subtargets with restricted soffset.
AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {

  Register SOffset = Root.getReg();

  if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
    SOffset = AMDGPU::SGPR_NULL;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
}
6920
/// Get an immediate that must be 32-bits, and treated as zero extended.
static std::optional<uint64_t>
  // getIConstantVRegVal sexts any values, so see if that matters.
  std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return std::nullopt;
  // Keep only the low 32 bits, to be interpreted as unsigned.
  return Lo_32(*OffsetVal);
}
6930
// Select a constant SMRD buffer offset and encode it for the subtarget's
// immediate offset field.
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  std::optional<uint64_t> OffsetVal =
      Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  // The raw byte offset must be re-encoded for the SMRD offset field.
  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}
6945
// SEA_ISLANDS-only variant of selectSMRDBufferImm using the 32-bit literal
// offset encoding.
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  std::optional<int64_t> EncodedImm =
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}
6961
// Select a (soffset + constant) buffer offset as a 32-bit SGPR base plus an
// encoded immediate offset.
AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  Register SOffset;
  unsigned Offset;
  std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
      *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
  if (!SOffset)
    return std::nullopt;

  std::optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
  if (!EncodedOffset)
    return std::nullopt;

  assert(MRI->getType(SOffset) == LLT::scalar(32));
  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
6982
// Match source modifiers for mad/fma_mix-style instructions, looking through
// an f16-to-f32 G_FPEXT. \p Matched is set when that extension pattern was
// recognized; otherwise the plain VOP3 mods are returned.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
                                                     bool &Matched) const {
  Matched = false;

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
    assert(MRI->getType(Src) == LLT::scalar(16));

    // Only change Src if src modifier could be gained. In such cases new Src
    // could be sgpr but this does not violate constant bus restriction for
    // instruction that is being selected.
    Src = stripBitCast(Src, *MRI);

    const auto CheckAbsNeg = [&]() {
      // Be careful about folding modifiers if we already have an abs. fneg is
      // applied last, so we don't want to apply an earlier fneg.
      if ((Mods & SISrcMods::ABS) == 0) {
        unsigned ModsTmp;
        std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);

        // neg modifiers toggle (two fnegs cancel); abs modifiers accumulate.
        if ((ModsTmp & SISrcMods::NEG) != 0)
          Mods ^= SISrcMods::NEG;

        if ((ModsTmp & SISrcMods::ABS) != 0)
          Mods |= SISrcMods::ABS;
      }
    };

    CheckAbsNeg();

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the sources's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;

    if (isExtractHiElt(*MRI, Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;
      CheckAbsNeg();
    }

    Matched = true;
  }

  return {Src, Mods};
}
7034
// Variant of selectVOP3PMadMixMods that only succeeds when the f16->f32
// fpext pattern actually matched.
AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
  if (!Matched)
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
7050
// Always-succeeding mad-mix mods selection; the Matched flag is ignored.
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
7063
// Select s_barrier_signal_isfirst: signal the barrier and copy the resulting
// SCC ("was first") into the destination register.
bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register CCReg = I.getOperand(0).getReg();

  // Set SCC to true, in case the barrier instruction gets converted to a NOP.
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
      .addImm(I.getOperand(2).getImm());

  // The result is produced in SCC; copy it out to the requested register.
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
                                      *MRI);
}
7082
// Select s_get_barrier_state: uses the IMM form when the barrier operand is
// a known constant, otherwise passes the value through M0.
bool AMDGPUInstructionSelector::selectSGetBarrierState(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand &BarOp = I.getOperand(2);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // Non-constant operand: the M0 form reads its input from M0.
    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(BarOp.getReg());
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }
  MachineInstrBuilder MIB;
  unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
                           : AMDGPU::S_GET_BARRIER_STATE_M0;
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  // Constrain the result register before attaching it as the def.
  auto DstReg = I.getOperand(0).getReg();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;
  MIB.addDef(DstReg);
  if (BarValImm) {
    MIB.addImm(*BarValImm);
  }
  I.eraseFromParent();
  return true;
}
7113
7114unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7115 if (HasInlineConst) {
7116 switch (IntrID) {
7117 default:
7118 llvm_unreachable("not a named barrier op");
7119 case Intrinsic::amdgcn_s_barrier_join:
7120 return AMDGPU::S_BARRIER_JOIN_IMM;
7121 case Intrinsic::amdgcn_s_wakeup_barrier:
7122 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7123 case Intrinsic::amdgcn_s_get_named_barrier_state:
7124 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7125 };
7126 } else {
7127 switch (IntrID) {
7128 default:
7129 llvm_unreachable("not a named barrier op");
7130 case Intrinsic::amdgcn_s_barrier_join:
7131 return AMDGPU::S_BARRIER_JOIN_M0;
7132 case Intrinsic::amdgcn_s_wakeup_barrier:
7133 return AMDGPU::S_WAKEUP_BARRIER_M0;
7134 case Intrinsic::amdgcn_s_get_named_barrier_state:
7135 return AMDGPU::S_GET_BARRIER_STATE_M0;
7136 };
7137 }
7138}
7139
// Select s_barrier_init / s_barrier_signal_var. The value routed through M0
// packs the 6-bit barrier ID in the low bits and the 6-bit member count at
// bit 16.
bool AMDGPUInstructionSelector::selectNamedBarrierInit(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand &BarOp = I.getOperand(1);
  const MachineOperand &CntOp = I.getOperand(2);

  // A member count of 0 means "keep existing member count". That plus a known
  // constant value for the barrier ID lets us use the immarg form.
  if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
    std::optional<int64_t> CntImm =
        getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
    if (CntImm && *CntImm == 0) {
      std::optional<int64_t> BarValImm =
          getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
      if (BarValImm) {
        auto BarID = ((*BarValImm) >> 4) & 0x3F;
        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
            .addImm(BarID);
        I.eraseFromParent();
        return true;
      }
    }
  }

  // BarID = (BarOp >> 4) & 0x3F
  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
      .add(BarOp)
      .addImm(4u)
      .setOperandDead(3); // Dead scc

  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
      .addReg(TmpReg0)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  // MO = ((CntOp & 0x3F) << shAmt) | BarID
  Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
      .add(CntOp)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  constexpr unsigned ShAmt = 16;
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
      .addReg(TmpReg2)
      .addImm(ShAmt)
      .setOperandDead(3); // Dead scc

  Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
      .addReg(TmpReg1)
      .addReg(TmpReg3)
      .setOperandDead(3); // Dead scc;

  // The selected M0-form instruction reads the packed value from M0.
  auto CopyMIB =
      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
  constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);

  unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
                     ? AMDGPU::S_BARRIER_INIT_M0
                     : AMDGPU::S_BARRIER_SIGNAL_M0;
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  I.eraseFromParent();
  return true;
}
7211
// Select s_barrier_join / s_wakeup_barrier / s_get_named_barrier_state,
// using the IMM form when the barrier ID is a known constant and the M0 form
// otherwise.
bool AMDGPUInstructionSelector::selectNamedBarrierInst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
                             ? I.getOperand(2)
                             : I.getOperand(1);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // BarID = (BarOp >> 4) & 0x3F
    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
        .addReg(BarOp.getReg())
        .addImm(4u)
        .setOperandDead(3); // Dead scc;

    Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
        .addReg(TmpReg0)
        .addImm(0x3F)
        .setOperandDead(3); // Dead scc;

    // The M0 form reads the barrier ID from M0.
    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(TmpReg1);
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  MachineInstrBuilder MIB;
  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  // Only s_get_named_barrier_state produces a result register.
  if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    auto DstReg = I.getOperand(0).getReg();
    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
    if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
      return false;
    MIB.addDef(DstReg);
  }

  if (BarValImm) {
    auto BarId = ((*BarValImm) >> 4) & 0x3F;
    MIB.addImm(BarId);
  }

  I.eraseFromParent();
  return true;
}
7262
7263void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7264 const MachineInstr &MI,
7265 int OpIdx) const {
7266 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7267 "Expected G_CONSTANT");
7268 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7269}
7270
7271void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7272 const MachineInstr &MI,
7273 int OpIdx) const {
7274 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7275 "Expected G_CONSTANT");
7276 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7277}
7278
7279void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7280 const MachineInstr &MI,
7281 int OpIdx) const {
7282 const MachineOperand &Op = MI.getOperand(1);
7283 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7284 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7285}
7286
7287void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7288 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7289 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7290 "Expected G_CONSTANT");
7291 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7292}
7293
7294/// This only really exists to satisfy DAG type checking machinery, so is a
7295/// no-op here.
7296void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7297 const MachineInstr &MI,
7298 int OpIdx) const {
7299 const MachineOperand &Op = MI.getOperand(OpIdx);
7300 int64_t Imm;
7301 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7302 MIB.addImm(Imm);
7303 else
7304 MIB.addImm(Op.getImm());
7305}
7306
7307void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7308 const MachineInstr &MI,
7309 int OpIdx) const {
7310 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7311}
7312
7313void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7314 const MachineInstr &MI,
7315 int OpIdx) const {
7316 assert(OpIdx >= 0 && "expected to match an immediate operand");
7317 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7318}
7319
7320void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7321 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7322 assert(OpIdx >= 0 && "expected to match an immediate operand");
7323 MIB.addImm(
7324 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7325}
7326
// Render modifiers from bit 0 of the select immediate; the false arm yields
// DST_OP_SEL.
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
                 : (int64_t)SISrcMods::DST_OP_SEL);
}
7334
7335void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7336 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7337 assert(OpIdx >= 0 && "expected to match an immediate operand");
7338 MIB.addImm(
7339 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7340}
7341
7342void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7343 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7344 assert(OpIdx >= 0 && "expected to match an immediate operand");
7345 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7346 ? (int64_t)(SISrcMods::OP_SEL_0)
7347 : 0);
7348}
7349
7350void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7351 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7352 assert(OpIdx >= 0 && "expected to match an immediate operand");
7353 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7354 : 0);
7355}
7356
7357void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7358 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7359 assert(OpIdx >= 0 && "expected to match an immediate operand");
7360 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7361 : 0);
7362}
7363
7364void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7365 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7366 assert(OpIdx >= 0 && "expected to match an immediate operand");
7367 MIB.addImm(
7368 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7369}
7370
7371void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7372 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7373 assert(OpIdx >= 0 && "expected to match an immediate operand");
7374 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7375 ? (int64_t)SISrcMods::DST_OP_SEL
7376 : 0);
7377}
7378
// Extract the cache-policy bits from a combined immediate operand.
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
}
7387
// Extract the swizzle flag from a combined immediate operand.
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
  MIB.addImm(Swizzle);
}
7397
// Extract the cache-policy bits and additionally force the GLC bit.
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}
7406
7407void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7408 const MachineInstr &MI,
7409 int OpIdx) const {
7410 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7411}
7412
7413void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7414 const MachineInstr &MI,
7415 int OpIdx) const {
7416 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7417 int ExpVal = APF.getExactLog2Abs();
7418 assert(ExpVal != INT_MIN);
7419 MIB.addImm(ExpVal);
7420}
7421
7422void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7423 const MachineInstr &MI,
7424 int OpIdx) const {
7425 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7426 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7427 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7428 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7429 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7430}
7431
7432void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7433 const MachineInstr &MI,
7434 int OpIdx) const {
7435 unsigned Mods = SISrcMods::OP_SEL_1;
7436 if (MI.getOperand(OpIdx).getImm())
7437 Mods ^= SISrcMods::NEG;
7438 MIB.addImm((int64_t)Mods);
7439}
7440
// Render VOP3P source modifiers where a non-zero immediate applies negation
// modifiers; OP_SEL_1 is always set.
void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
                                                    const MachineInstr &MI,
                                                    int OpIdx) const {
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (MI.getOperand(OpIdx).getImm())
  MIB.addImm((int64_t)Mods);
}
7449
7450void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7451 const MachineInstr &MI,
7452 int OpIdx) const {
7453 unsigned Val = MI.getOperand(OpIdx).getImm();
7454 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7455 if (Val == 1) // neg
7456 Mods ^= SISrcMods::NEG;
7457 if (Val == 2) // abs
7458 Mods ^= SISrcMods::ABS;
7459 if (Val == 3) // neg and abs
7460 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7461 MIB.addImm((int64_t)Mods);
7462}
7463
// Render a prefetch scope immediate, widened to at least SE scope on
// subtargets where CU-scope prefetch is unsafe.
void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  uint32_t V = MI.getOperand(2).getImm();
  if (!Subtarget->hasSafeCUPrefetch())
    V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
  MIB.addImm(V);
}
7474
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  unsigned Val = MI.getOperand(OpIdx).getImm();
  unsigned New = 0;
  // Each of the two input bits maps to one modifier bit in the output.
  if (Val & 0x1)
  if (Val & 0x2)
  MIB.addImm(New);
}
7486
// Whether \p Imm can be encoded as an inline constant, per
// SIInstrInfo::isInlineConstant.
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}
7490
// Whether the FP value \p Imm can be encoded as an inline constant, per
// SIInstrInfo::isInlineConstant.
bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n =G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1564
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
bool isFPPredicate() const
Definition InstrTypes.h:782
bool isIntPredicate() const
Definition InstrTypes.h:783
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
LLVM_ABI DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.cpp:48
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:857
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1423
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:56
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:652
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:460
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:293
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:493
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:313
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:438
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:468
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:500
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.